mirror of
https://git.savannah.gnu.org/git/parallel.git
synced 2024-11-26 16:07:54 +00:00
add gnuparallel python package.
This commit is contained in:
parent
141eb4d5ea
commit
b2f8450bf3
6
src/optional/genresults.sh
Executable file
6
src/optional/genresults.sh
Executable file
|
@ -0,0 +1,6 @@
|
||||||
|
#!/bin/bash
#
# Generate the result files used to test the query modules.
#
# The commands below refer to ../parallel and testresults/ with paths
# relative to this script's own directory, so switch there first; this
# lets the script be invoked from any working directory.
cd "$(dirname "$0")" || exit 1

# Make sure the output directory exists before parallel writes into it.
mkdir -p testresults

# "foo_" result set: a in {1, 2} crossed with b in {0.30, 0.40}.
../parallel --header : --result testresults/foo_ echo {a} {b} ::: a 1 2 ::: b 0.30 0.40

# "bar_" result set: a in {5, 6} crossed with b in {0.70, 0.80}.
../parallel --header : --result testresults/bar_ echo {a} {b} ::: a 5 6 ::: b 0.70 0.80
|
4
src/optional/python/.gitignore
vendored
Normal file
4
src/optional/python/.gitignore
vendored
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
*.pyc
|
||||||
|
build
|
||||||
|
dist
|
||||||
|
*.egg-info
|
9
src/optional/python/README
Normal file
9
src/optional/python/README
Normal file
|
@ -0,0 +1,9 @@
|
||||||
|
gnuparallel : Simple loading of GNU parallel result files.
|
||||||
|
|
||||||
|
The gnuparallel package provides a single function, `load`, which
|
||||||
|
loads results from files generated by GNU parallel into a Pandas
|
||||||
|
DataFrame object. See `help(gnuparallel.load)` for details.
|
||||||
|
|
||||||
|
Installation:
|
||||||
|
|
||||||
|
`python setup.py install`
|
3
src/optional/python/gnuparallel/__init__.py
Normal file
3
src/optional/python/gnuparallel/__init__.py
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
"""Load GNU Parallel --results files into a Pandas DataFrame."""
|
||||||
|
|
||||||
|
# Explicit relative import: unlike the implicit form ``from _loader import *``
# it cannot be shadowed by an unrelated top-level module named ``_loader``,
# and it is the only relative form Python 3 accepts.  The names re-exported
# are limited by ``_loader.__all__``.
from ._loader import *
|
143
src/optional/python/gnuparallel/_loader.py
Executable file
143
src/optional/python/gnuparallel/_loader.py
Executable file
|
@ -0,0 +1,143 @@
|
||||||
|
"""
|
||||||
|
A function for loading the --result files generated by GNU Parallel.
|
||||||
|
"""
|
||||||
|
__all__ = ['load']
|
||||||
|
|
||||||
|
from cStringIO import StringIO
|
||||||
|
import pandas as pd
|
||||||
|
import os
|
||||||
|
|
||||||
|
def load(_dir, _process=None, _format=None, _stream='stdout',
         _prefix=None, _infer_types=True, **options):
    """Load files generated with parallel's --result option.

    One use of GNU parallel is to call one command many times, each
    time with a different set of arguments. With the --result option,
    parallel will capture stdout and stderr from these processes and
    store them in files named according to the arguments of each
    individual call. This function provides easy loading of these
    result files into a Pandas DataFrame.

    Parameters
    ----------
    _dir : str
        Directory containing the results files.
    _process : function, optional
        Function that opens a results file and returns an object containing
        its results. If not provided, the resulting data frame will include
        a column containing the file names, not the actual results.

        If provided, the function should take a filename as its sole parameter.
        Whatever the function returns will be stored in the "res" column of
        the resulting DataFrame.
    _format : dict, optional
        Dictionary of format strings, used to convert any provided filter
        values to a format matching the results file names.

        For example, if the `foo` parameter to parallel was "0.10" and you pass
        foo=0.10 as an option, you will not find the intended file because
        str(0.10) == "0.1". To fix this, you should also include the key-value
        pair "foo": "%.2f" in the _format dict. This is usually only necessary
        for float-valued arguments where rounding or precision issues might
        affect the matching process.
    _stream : str, optional
        Specify either "stdout" or "stderr" to load results files from the
        corresponding stream. Default is "stdout".
    _prefix : str, optional
        Only load result files with a specific prefix. When using the --result
        option to parallel it is possible to specify a prefix for all of the
        result files. For example,
            parallel --result /some/dir/a_prefix ...
        would place all result files into the `/some/dir` directory and all of
        the file names would begin with "a_prefix". This parameter lets you
        filter based on this prefix. If None, allow any prefix. Default None.
    _infer_types : bool, optional
        Infer data types for option values. All option values are technically
        strings (since they were passed on the command line). When _infer_types
        is True, the resulting DataFrame will convert these values to inferred
        dtypes, e.g. the number 1 instead of "1". Default True.

        Inference is applied to the option/metadata columns only; the objects
        returned by `_process` are stored in the "res" column untouched.
    **options : kwargs
        Additional keyword arguments that will be used to filter the subset
        of results included in the output. The values can be either single
        values or iterables. If they are iterable, files corresponding to any
        of the included values will be considered a match.

        For example, passing `foo=[1,2,3]` will include results from files
        corresponding to runs where the parallel argument named `foo` had
        the value "1", "2", or "3".

        See also the _format parameter.

    Returns
    -------
    res : pandas.DataFrame
        A DataFrame with one column named for each of the parallel arguments
        and, depending on the _process argument, either:
            - A "res" column containing the results corresponding to each run.
            - A "resfile" column containing the names of the results files.
    """
    if _format is None:
        _format = {}

    # Normalise every filter to a set of strings comparable with the values
    # parsed from the file names.  Strings are treated as scalars even where
    # they are iterable (Python 3); items() is available on Python 2 and 3.
    filters = {}
    for key, value in options.items():
        is_container = (hasattr(value, '__iter__')
                        and not isinstance(value, (str, bytes)))
        candidates = value if is_container else [value]
        fmt = _format.get(key, '')
        filters[key] = set(_stringify(x, fmt) for x in candidates)
    filters['_stream'] = [_stream]
    if _prefix:
        filters['_prefix'] = [_prefix]

    # Iterate over results files and collect the matches.
    matches = []
    for name in os.listdir(_dir):
        metadata = _parse_name(name)
        metadata['resfile'] = os.path.join(_dir, metadata['resfile'])
        if _select(metadata, filters):
            matches.append(metadata)

    # Create a DataFrame from the matches.
    df = pd.DataFrame(matches)

    # Run the user's loader before type inference and keep its output aside:
    # the inference step below serialises the frame to CSV, which would turn
    # any non-scalar result (arrays, frames, ...) into its string form.
    results = None
    if _process and not df.empty:
        results = df.resfile.apply(_process)
        df = df.drop('resfile', axis=1)

    # Optionally try to convert string argument values to numeric types by
    # round-tripping the frame through CSV and letting read_csv infer dtypes.
    if _infer_types:
        buf = StringIO()
        df.to_csv(buf)
        df = pd.read_csv(StringIO(buf.getvalue()), index_col=0)

    # The CSV round trip preserves the integer row index, so the results
    # align with their rows when attached here.
    if results is not None:
        df['res'] = results

    return df
|
||||||
|
|
||||||
|
def _parse_name(file, sep='\t'):
|
||||||
|
"""Return a dict containing metadata extracted from the file name."""
|
||||||
|
tokens = file.split(sep)
|
||||||
|
prefix_stream = tokens[0]
|
||||||
|
metadata = {k:v for k,v in zip(tokens[1::2], tokens[2::2])}
|
||||||
|
|
||||||
|
stream_index = prefix_stream.find('stdout')
|
||||||
|
if stream_index == -1:
|
||||||
|
stream_index = prefix_stream.find('stderr')
|
||||||
|
prefix, stream = prefix_stream[:stream_index], prefix_stream[stream_index:]
|
||||||
|
|
||||||
|
metadata.update({'_prefix': prefix, '_stream': stream, 'resfile': file})
|
||||||
|
return metadata
|
||||||
|
|
||||||
|
def _select(metadata, filter):
|
||||||
|
"""Return true if the metadata entry matches the filter, False otherwise."""
|
||||||
|
if any(k not in metadata for k in filter):
|
||||||
|
return False
|
||||||
|
if any(all(v != metadata[k] for v in vs) for k,vs in filter.iteritems()):
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
|
def _stringify(x, fmt):
|
||||||
|
"""Return the string representation of x, using a format string if provided"""
|
||||||
|
if fmt:
|
||||||
|
return fmt % x
|
||||||
|
else:
|
||||||
|
return str(x)
|
15
src/optional/python/setup.py
Executable file
15
src/optional/python/setup.py
Executable file
|
@ -0,0 +1,15 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
from distutils.core import setup
|
||||||
|
|
||||||
|
# Distribution metadata for the gnuparallel package, collected in one
# mapping and handed to setup() as keyword arguments.
# NOTE(review): ``install_requires`` is a setuptools keyword; confirm that
# distutils.core.setup honours it rather than just warning about it.
metadata = {
    'name': 'gnuparallel',
    'version': '0.1',
    'description': 'Load GNU parallel result files.',
    'author': 'Drew Frank',
    'author_email': 'drewfrank@gmail.com',
    'packages': ['gnuparallel'],
    'install_requires': ['pandas'],
}

setup(**metadata)
|
66
src/optional/python/tests/test_loader.py
Normal file
66
src/optional/python/tests/test_loader.py
Normal file
|
@ -0,0 +1,66 @@
|
||||||
|
import os
import unittest

import numpy as np
import pandas as pd

from gnuparallel import load
|
||||||
|
|
||||||
|
# Locate the fixture directory relative to this file rather than the current
# working directory, so the tests can be launched from anywhere.
result_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                          '..', '..', 'testresults')


class TestLoader(unittest.TestCase):
    """Exercise gnuparallel.load against the files in testresults/.

    The fixtures hold stdout and stderr files for two prefixes: "foo_"
    (a in {1, 2}, b in {0.30, 0.40}) and "bar_" (a in {5, 6},
    b in {0.70, 0.80}).
    """

    def test_basics(self):
        # Default stream is stdout: 4 "foo_" files + 4 "bar_" files.
        df = load(result_dir)
        self.assertEqual(set(df.columns), set(['a', 'b', '_prefix', 'resfile', '_stream']))
        self.assertEqual(df.shape[0], 8)

    def test_prefix(self):
        df = load(result_dir, _prefix='foo_')
        self.assertEqual(df.shape[0], 4)
        self.assertEqual(df.a.sum(), 6)

        df = load(result_dir, _prefix='bar_')
        self.assertEqual(df.shape[0], 4)
        self.assertEqual(df.a.sum(), 22)

        df = load(result_dir, _prefix='BAD')
        self.assertTrue(df.empty)

    def test_filters(self):
        # A scalar and a one-element list must select the same rows.
        df = load(result_dir, a=2)
        self.assertEqual(df.shape[0], 2)
        self.assertEqual(df.a.sum(), 4)

        df = load(result_dir, a=[2])
        self.assertEqual(df.shape[0], 2)
        self.assertEqual(df.a.sum(), 4)

        df = load(result_dir, a=[1,2])
        self.assertEqual(df.shape[0], 4)
        self.assertEqual(df.a.sum(), 6)

        df = load(result_dir, a=1000)
        self.assertTrue(df.empty)

    def test_infer_types(self):
        # numpy is referenced directly; the ``pd.np`` alias is not available
        # in current pandas releases.
        df = load(result_dir)
        self.assertEqual(df.a.dtype, np.int64)

        df = load(result_dir, _infer_types=False)
        self.assertEqual(df.a.dtype, np.object_)

    def test_format(self):
        # str(0.3) == "0.3" does not match the "0.30" in the file names ...
        df = load(result_dir, b=0.3)
        self.assertTrue(df.empty)

        # ... unless a format string restores the two decimals.
        df = load(result_dir, b=0.3, _format={'b': '%.2f'})
        self.assertEqual(df.shape[0], 2)

    def test_stream(self):
        df = load(result_dir, _stream='stderr')
        self.assertTrue((df._stream == 'stderr').all())

    def test_process(self):
        df = load(result_dir, a=1, _process=lambda x: np.loadtxt(x).sum())
        # Directory listing order is arbitrary, so compare the sorted results
        # instead of relying on which file ends up in row 0.
        results = sorted(df.res)
        self.assertEqual(len(results), 2)
        self.assertAlmostEqual(results[0], 1.3)
        self.assertAlmostEqual(results[1], 1.4)


if __name__ == '__main__':
    unittest.main()
|
0
src/optional/testresults/bar_stderr a 5 b 0.70
Normal file
0
src/optional/testresults/bar_stderr a 5 b 0.70
Normal file
0
src/optional/testresults/bar_stderr a 5 b 0.80
Normal file
0
src/optional/testresults/bar_stderr a 5 b 0.80
Normal file
0
src/optional/testresults/bar_stderr a 6 b 0.70
Normal file
0
src/optional/testresults/bar_stderr a 6 b 0.70
Normal file
0
src/optional/testresults/bar_stderr a 6 b 0.80
Normal file
0
src/optional/testresults/bar_stderr a 6 b 0.80
Normal file
1
src/optional/testresults/bar_stdout a 5 b 0.70
Normal file
1
src/optional/testresults/bar_stdout a 5 b 0.70
Normal file
|
@ -0,0 +1 @@
|
||||||
|
5 0.70
|
1
src/optional/testresults/bar_stdout a 5 b 0.80
Normal file
1
src/optional/testresults/bar_stdout a 5 b 0.80
Normal file
|
@ -0,0 +1 @@
|
||||||
|
5 0.80
|
1
src/optional/testresults/bar_stdout a 6 b 0.70
Normal file
1
src/optional/testresults/bar_stdout a 6 b 0.70
Normal file
|
@ -0,0 +1 @@
|
||||||
|
6 0.70
|
1
src/optional/testresults/bar_stdout a 6 b 0.80
Normal file
1
src/optional/testresults/bar_stdout a 6 b 0.80
Normal file
|
@ -0,0 +1 @@
|
||||||
|
6 0.80
|
0
src/optional/testresults/foo_stderr a 1 b 0.30
Normal file
0
src/optional/testresults/foo_stderr a 1 b 0.30
Normal file
0
src/optional/testresults/foo_stderr a 1 b 0.40
Normal file
0
src/optional/testresults/foo_stderr a 1 b 0.40
Normal file
0
src/optional/testresults/foo_stderr a 2 b 0.30
Normal file
0
src/optional/testresults/foo_stderr a 2 b 0.30
Normal file
0
src/optional/testresults/foo_stderr a 2 b 0.40
Normal file
0
src/optional/testresults/foo_stderr a 2 b 0.40
Normal file
1
src/optional/testresults/foo_stdout a 1 b 0.30
Normal file
1
src/optional/testresults/foo_stdout a 1 b 0.30
Normal file
|
@ -0,0 +1 @@
|
||||||
|
1 0.30
|
1
src/optional/testresults/foo_stdout a 1 b 0.40
Normal file
1
src/optional/testresults/foo_stdout a 1 b 0.40
Normal file
|
@ -0,0 +1 @@
|
||||||
|
1 0.40
|
1
src/optional/testresults/foo_stdout a 2 b 0.30
Normal file
1
src/optional/testresults/foo_stdout a 2 b 0.30
Normal file
|
@ -0,0 +1 @@
|
||||||
|
2 0.30
|
1
src/optional/testresults/foo_stdout a 2 b 0.40
Normal file
1
src/optional/testresults/foo_stdout a 2 b 0.40
Normal file
|
@ -0,0 +1 @@
|
||||||
|
2 0.40
|
Loading…
Reference in a new issue