Mirror of https://git.savannah.gnu.org/git/parallel.git
add gnuparallel python package.
parent 141eb4d5ea
commit b2f8450bf3
6 src/optional/genresults.sh (Executable file)
@@ -0,0 +1,6 @@
#!/bin/bash
#
# Generate the result files used to test the query modules.

../parallel --header : --result testresults/foo_ echo {a} {b} ::: a 1 2 ::: b 0.30 0.40
../parallel --header : --result testresults/bar_ echo {a} {b} ::: a 5 6 ::: b 0.70 0.80
4 src/optional/python/.gitignore (vendored, Normal file)
@@ -0,0 +1,4 @@
*.pyc
build
dist
*.egg-info
9 src/optional/python/README (Normal file)
@@ -0,0 +1,9 @@
gnuparallel : Simple loading of GNU parallel result files.

The gnuparallel package provides a single function, `load`, which
loads results from files generated by GNU parallel into a Pandas
DataFrame object. See `help(gnuparallel.load)` for details.

Installation:

`python setup.py install`
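A minimal usage sketch (the paths are illustrative; it assumes result files were generated as in src/optional/genresults.sh above, so the argument columns are named a and b):

    from gnuparallel import load

    # Load the stdout result files from a results directory into a DataFrame.
    # Each parallel argument becomes a column; "resfile" holds the file paths.
    df = load('testresults', _prefix='foo_')
    print(df[['a', 'b', 'resfile']])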
3 src/optional/python/gnuparallel/__init__.py (Normal file)
@@ -0,0 +1,3 @@
"""Load GNU Parallel --results files into a Pandas DataFrame."""

from _loader import *
143 src/optional/python/gnuparallel/_loader.py (Executable file)
@@ -0,0 +1,143 @@
"""
A function for loading the --result files generated by GNU Parallel.
"""
__all__ = ['load']

from cStringIO import StringIO
import pandas as pd
import os

def load(_dir, _process=None, _format=None, _stream='stdout',
         _prefix=None, _infer_types=True, **options):
    """Load files generated with parallel's --result option.

    One use of GNU parallel is to call one command many times, each
    time with a different set of arguments. With the --result option,
    parallel will capture stdout and stderr from these processes and
    store them in files named according to the arguments of each
    individual call. This function provides easy loading of these
    result files into a Pandas DataFrame.

    Parameters
    ----------
    _dir : str
        Directory containing the results files.
    _process : function, optional
        Function that opens a results file and returns an object containing
        its results. If not provided, the resulting data frame will include
        a column containing the file names, not the actual results.

        If provided, the function should take a filename as its sole parameter.
        Whatever the function returns will be stored in the "res" column of
        the resulting DataFrame.
    _format : dict, optional
        Dictionary of format strings, used to convert any provided filter
        values to a format matching the results file names.

        For example, if the `foo` parameter to parallel was "0.10" and you pass
        foo=0.10 as an option, you will not find the intended file because
        str(0.10) == "0.1". To fix this, you should also include the key-value
        pair "foo": "%.2f" in the _format dict. This is usually only necessary
        for float-valued arguments where rounding or precision issues might
        affect the matching process.
    _stream : str, optional
        Specify either "stdout" or "stderr" to load results files from the
        corresponding stream. Default is "stdout".
    _prefix : str, optional
        Only load result files with a specific prefix. When using the --result
        option to parallel it is possible to specify a prefix for all of the
        result files. For example,
            parallel --result /some/dir/a_prefix ...
        would place all result files into the `/some/dir` directory and all of
        the file names would begin with "a_prefix". This parameter lets you
        filter based on this prefix. If None, allow any prefix. Default None.
    _infer_types : bool, optional
        Infer data types for option values. All option values are technically
        strings (since they were passed on the command line). When _infer_types
        is True, the resulting DataFrame will convert these values to inferred
        dtypes, e.g. the number 1 instead of "1". Default True.
    **options : kwargs
        Additional keyword arguments that will be used to filter the subset
        of results included in the output. The values can be either single
        values or iterables. If they are iterable, files corresponding to any
        of the included values will be considered a match.

        For example, passing `foo=[1,2,3]` will include results from files
        corresponding to runs where the parallel argument named `foo` had
        the value "1", "2", or "3".

        See also the _format parameter.

    Returns
    -------
    res : pandas.DataFrame
        A DataFrame with one column named for each of the parallel arguments
        and, depending on the _process argument, either:
        - A "res" column containing the results corresponding to each run.
        - A "resfile" column containing the names of the results files.
    """
    if _format is None:
        _format = dict()

    # Process the filter options.
    for k, v in options.iteritems():
        if hasattr(v, '__iter__') and not isinstance(v, basestring):
            pass  # v is already a container type.
        else:
            options[k] = [v]
    for k, v in options.iteritems():
        options[k] = set(_stringify(x, _format.get(k, '')) for x in v)
    options['_stream'] = [_stream]
    if _prefix:
        options['_prefix'] = [_prefix]

    # Iterate over results files and collect the matches.
    matches = []
    for file in os.listdir(_dir):
        metadata = _parse_name(file)
        metadata['resfile'] = os.path.join(_dir, metadata['resfile'])
        if _select(metadata, options):
            matches.append(metadata)

    # Create a DataFrame from the matches.
    df = pd.DataFrame(matches)
    if _process and not df.empty:
        df['res'] = df.resfile.apply(_process)
        df = df.drop('resfile', axis=1)

    # Optionally try to convert string argument values to numeric types.
    if _infer_types:
        buf = StringIO()
        df.to_csv(buf)
        df = pd.read_csv(StringIO(buf.getvalue()), index_col=0)

    return df

def _parse_name(file, sep='\t'):
    """Return a dict containing metadata extracted from the file name."""
    tokens = file.split(sep)
    prefix_stream = tokens[0]
    metadata = {k: v for k, v in zip(tokens[1::2], tokens[2::2])}

    stream_index = prefix_stream.find('stdout')
    if stream_index == -1:
        stream_index = prefix_stream.find('stderr')
    prefix, stream = prefix_stream[:stream_index], prefix_stream[stream_index:]

    metadata.update({'_prefix': prefix, '_stream': stream, 'resfile': file})
    return metadata

def _select(metadata, filter):
    """Return True if the metadata entry matches the filter, False otherwise."""
    if any(k not in metadata for k in filter):
        return False
    if any(all(v != metadata[k] for v in vs) for k, vs in filter.iteritems()):
        return False
    return True

def _stringify(x, fmt):
    """Return the string representation of x, using a format string if provided."""
    if fmt:
        return fmt % x
    else:
        return str(x)
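A short sketch of the filtering behavior described in the docstring above, mirroring the test cases later in this commit (the 'testresults' path is an assumption about where the genresults.sh output lives):

    from gnuparallel import load
    import pandas as pd

    # b=0.3 alone matches nothing: str(0.3) == "0.3", but the files were
    # generated with the value "0.30", so a format string is needed.
    df = load('testresults', b=0.3, _format={'b': '%.2f'})

    # With _process, each matching stdout file is opened and the function's
    # return value is stored in a "res" column instead of the file name.
    df = load('testresults', a=1, _process=lambda f: pd.np.loadtxt(f).sum())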
15 src/optional/python/setup.py (Executable file)
@@ -0,0 +1,15 @@
#!/usr/bin/env python

from distutils.core import setup

setup(
    name = 'gnuparallel',
    version = '0.1',
    description = 'Load GNU parallel result files.',
    author = 'Drew Frank',
    author_email = 'drewfrank@gmail.com',
    packages = [
        'gnuparallel'
    ],
    install_requires = ['pandas']
)
66 src/optional/python/tests/test_loader.py (Normal file)
@@ -0,0 +1,66 @@
import pandas as pd
import unittest

from gnuparallel import load

result_dir = '../../testresults'

class TestLoader(unittest.TestCase):

    def test_basics(self):
        df = load(result_dir)
        self.assertEqual(set(df.columns), set(['a', 'b', '_prefix', 'resfile', '_stream']))
        self.assertEqual(df.shape[0], 8)

    def test_prefix(self):
        df = load(result_dir, _prefix='foo_')
        self.assertEqual(df.shape[0], 4)
        self.assertEqual(df.a.sum(), 6)

        df = load(result_dir, _prefix='bar_')
        self.assertEqual(df.shape[0], 4)
        self.assertEqual(df.a.sum(), 22)

        df = load(result_dir, _prefix='BAD')
        self.assertTrue(df.empty)

    def test_filters(self):
        df = load(result_dir, a=2)
        self.assertEqual(df.shape[0], 2)
        self.assertEqual(df.a.sum(), 4)

        df = load(result_dir, a=[2])
        self.assertEqual(df.shape[0], 2)
        self.assertEqual(df.a.sum(), 4)

        df = load(result_dir, a=[1, 2])
        self.assertEqual(df.shape[0], 4)
        self.assertEqual(df.a.sum(), 6)

        df = load(result_dir, a=1000)
        self.assertTrue(df.empty)

    def test_infer_types(self):
        df = load(result_dir)
        self.assertEqual(df.a.dtype, pd.np.int64)

        df = load(result_dir, _infer_types=False)
        self.assertEqual(df.a.dtype, pd.np.object_)

    def test_format(self):
        df = load(result_dir, b=0.3)
        self.assertTrue(df.empty)

        df = load(result_dir, b=0.3, _format={'b': '%.2f'})
        self.assertEqual(df.shape[0], 2)

    def test_stream(self):
        df = load(result_dir, _stream='stderr')
        self.assertTrue((df._stream == 'stderr').all())

    def test_process(self):
        df = load(result_dir, a=1, _process=lambda x: pd.np.loadtxt(x).sum())
        self.assertAlmostEqual(df.res[0], 1.4)

if __name__ == '__main__':
    unittest.main()
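For completeness, a sketch of running the suite programmatically rather than via `python test_loader.py` (it assumes the fixtures produced by genresults.sh exist at ../../testresults, as result_dir above expects):

    import unittest
    import test_loader

    # Collect every TestLoader test case and run it with a verbose runner.
    suite = unittest.defaultTestLoader.loadTestsFromModule(test_loader)
    unittest.TextTestRunner(verbosity=2).run(suite)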
0 src/optional/testresults/bar_stderr a 5 b 0.70 (Normal file)
0 src/optional/testresults/bar_stderr a 5 b 0.80 (Normal file)
0 src/optional/testresults/bar_stderr a 6 b 0.70 (Normal file)
0 src/optional/testresults/bar_stderr a 6 b 0.80 (Normal file)
1 src/optional/testresults/bar_stdout a 5 b 0.70 (Normal file)
@@ -0,0 +1 @@
5 0.70
1 src/optional/testresults/bar_stdout a 5 b 0.80 (Normal file)
@@ -0,0 +1 @@
5 0.80
1 src/optional/testresults/bar_stdout a 6 b 0.70 (Normal file)
@@ -0,0 +1 @@
6 0.70
1 src/optional/testresults/bar_stdout a 6 b 0.80 (Normal file)
@@ -0,0 +1 @@
6 0.80
0 src/optional/testresults/foo_stderr a 1 b 0.30 (Normal file)
0 src/optional/testresults/foo_stderr a 1 b 0.40 (Normal file)
0 src/optional/testresults/foo_stderr a 2 b 0.30 (Normal file)
0 src/optional/testresults/foo_stderr a 2 b 0.40 (Normal file)
1 src/optional/testresults/foo_stdout a 1 b 0.30 (Normal file)
@@ -0,0 +1 @@
1 0.30
1 src/optional/testresults/foo_stdout a 1 b 0.40 (Normal file)
@@ -0,0 +1 @@
1 0.40
1 src/optional/testresults/foo_stdout a 2 b 0.30 (Normal file)
@@ -0,0 +1 @@
2 0.30
1 src/optional/testresults/foo_stdout a 2 b 0.40 (Normal file)
@@ -0,0 +1 @@
2 0.40