"""
A function for loading the --result files generated by GNU Parallel.
"""
__all__ = ['load']

import os

import pandas as pd

try:  # Python 2
    _STRING_TYPES = (basestring,)
except NameError:  # Python 3
    _STRING_TYPES = (str, bytes)

# Stream names that terminate the first tab-separated token of a file name.
_STREAMS = ('stdout', 'stderr')

# Columns describing the result file itself rather than a parallel argument;
# they are never subjected to type inference.
_RESERVED_COLUMNS = frozenset(['_prefix', '_stream', 'resfile', 'res'])


def load(_dir, _process=None, _format=None, _stream='stdout',
         _prefix=None, _infer_types=True, **options):
    """Load files generated with parallel's --result option.

    One use of GNU parallel is to call one command many times, each
    time with a different set of arguments. With the --result option,
    parallel will capture stdout and stderr from these processes and
    store them in files named according to the arguments of each
    individual call. This function provides easy loading of these
    result files into a Pandas DataFrame.

    Parameters
    ----------
    _dir : str
        Directory containing the results files. Entries whose name does
        not look like a result file (first tab-separated token not ending
        in "stdout" or "stderr") are ignored.
    _process : function, optional
        Function that opens a results file and returns an object containing
        its results. If not provided, the resulting data frame will include
        a column containing the file names, not the actual results.

        If provided, the function should take a filename as its sole
        parameter. Whatever the function returns will be stored, unmodified,
        in the "res" column of the resulting DataFrame.
    _format : dict, optional
        Dictionary of format strings, used to convert any provided filter
        values to a format matching the results file names.

        For example, if the `foo` parameter to parallel was "0.10" and you
        pass foo=0.10 as an option, you will not find the intended file
        because str(0.10) == "0.1". To fix this, you should also include the
        key-value pair "foo": "%.2f" in the _format dict. This is usually
        only necessary for float-valued arguments where rounding or
        precision issues might affect the matching process.
    _stream : str, optional
        Specify either "stdout" or "stderr" to load results files from the
        corresponding stream. Default is "stdout".
    _prefix : str, optional
        Only load result files with a specific prefix. When using the
        --result option to parallel it is possible to specify a prefix for
        all of the result files. For example,
            parallel --result /some/dir/a_prefix ...
        would place all result files into the `/some/dir` directory and all
        of the file names would begin with "a_prefix". This parameter lets
        you filter based on this prefix. If None, allow any prefix.
        Default None.
    _infer_types : bool, optional
        Infer numeric types for argument values. All argument values are
        technically strings (since they were passed on the command line).
        When _infer_types is True, every argument column whose values all
        parse as numbers is converted to a numeric dtype, e.g. the number 1
        instead of "1". The "_prefix", "_stream", "resfile" and "res"
        columns are never converted. Default True.
    **options : kwargs
        Additional keyword arguments that will be used to filter the subset
        of results included in the output. The values can be either single
        values or iterables. If they are iterable, files corresponding to
        any of the included values will be considered a match.

        For example, passing `foo=[1,2,3]` will include results from files
        corresponding to runs where the parallel argument named `foo` had
        the value "1", "2", or "3".

        See also the _format parameter.

    Returns
    -------
    res : pandas.DataFrame
        A DataFrame with one column named for each of the parallel
        arguments, the "_prefix" and "_stream" columns, and, depending on
        the _process argument, either:
            - A "res" column containing the results corresponding to each
              run.
            - A "resfile" column containing the names of the results files.
        The frame is empty when no file matches.
    """
    if _format is None:
        _format = {}

    # Normalise every filter to a set of strings comparable with the
    # values embedded in the file names.
    filters = {}
    for key, value in options.items():
        fmt = _format.get(key, '')
        filters[key] = set(_stringify(x, fmt) for x in _as_values(value))
    filters['_stream'] = [_stream]
    if _prefix:
        filters['_prefix'] = [_prefix]

    # Iterate over results files and collect the matches.
    matches = []
    for name in os.listdir(_dir):
        metadata = _parse_name(name)
        if metadata is None:
            continue  # Not a parallel result file.
        metadata['resfile'] = os.path.join(_dir, metadata['resfile'])
        if _select(metadata, filters):
            matches.append(metadata)

    # Create a DataFrame from the matches.
    df = pd.DataFrame(matches)
    if _process is not None and not df.empty:
        df['res'] = df['resfile'].apply(_process)
        df = df.drop('resfile', axis=1)

    # Optionally convert string argument values to numeric types. Only the
    # argument columns are touched so that processed results and file
    # metadata keep exactly the values they were given.
    if _infer_types:
        for column in df.columns:
            if column not in _RESERVED_COLUMNS:
                df[column] = _to_numeric(df[column])

    return df


def _as_values(value):
    """Return value itself if it is a non-string iterable, else [value]."""
    if hasattr(value, '__iter__') and not isinstance(value, _STRING_TYPES):
        return value
    return [value]


def _to_numeric(column):
    """Return column as a numeric Series, or unchanged if that fails."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


def _parse_name(file, sep='\t'):
    """Return a dict containing metadata extracted from the file name.

    The name is split on `sep`: the first token is "<prefix><stream>" and
    the remaining tokens alternate argument name / argument value. Returns
    None when the first token does not end in "stdout" or "stderr".
    """
    tokens = file.split(sep)
    prefix_stream = tokens[0]

    # The stream name is always the suffix of the first token; matching it
    # as a suffix keeps prefixes that contain "stdout"/"stderr" intact.
    for stream in _STREAMS:
        if prefix_stream.endswith(stream):
            break
    else:
        return None

    metadata = dict(zip(tokens[1::2], tokens[2::2]))
    metadata.update({
        '_prefix': prefix_stream[:-len(stream)],
        '_stream': stream,
        'resfile': file,
    })
    return metadata


def _select(metadata, filter):
    """Return True if the metadata entry matches the filter, else False.

    `filter` maps a metadata key to a collection of accepted values; every
    key must be present in `metadata` with one of its accepted values.
    """
    return all(key in metadata and metadata[key] in allowed
               for key, allowed in filter.items())


def _stringify(x, fmt):
    """Return the string form of x, using the format string if provided."""
    if fmt:
        return fmt % x
    return str(x)
class TestLoader(unittest.TestCase):
    """Exercise `load` against the fixtures produced by genresults.sh.

    The fixtures hold a "foo_" run (a in {1, 2}, b in {0.30, 0.40}) and a
    "bar_" run (a in {5, 6}, b in {0.70, 0.80}), each with a stdout and a
    stderr file per argument combination.
    """

    @staticmethod
    def _sum_numbers(path):
        """Return the sum of the whitespace-separated numbers in `path`."""
        with open(path) as handle:
            return sum(float(token) for token in handle.read().split())

    def test_basics(self):
        df = load(result_dir)
        self.assertEqual(set(df.columns),
                         {'a', 'b', '_prefix', 'resfile', '_stream'})
        self.assertEqual(df.shape[0], 8)

    def test_prefix(self):
        df = load(result_dir, _prefix='foo_')
        self.assertEqual(df.shape[0], 4)
        self.assertEqual(df.a.sum(), 6)

        df = load(result_dir, _prefix='bar_')
        self.assertEqual(df.shape[0], 4)
        self.assertEqual(df.a.sum(), 22)

        df = load(result_dir, _prefix='BAD')
        self.assertTrue(df.empty)

    def test_filters(self):
        df = load(result_dir, a=2)
        self.assertEqual(df.shape[0], 2)
        self.assertEqual(df.a.sum(), 4)

        df = load(result_dir, a=[2])
        self.assertEqual(df.shape[0], 2)
        self.assertEqual(df.a.sum(), 4)

        df = load(result_dir, a=[1, 2])
        self.assertEqual(df.shape[0], 4)
        self.assertEqual(df.a.sum(), 6)

        df = load(result_dir, a=1000)
        self.assertTrue(df.empty)

    def test_infer_types(self):
        # Compare dtype names instead of going through `pd.np`, which newer
        # pandas releases no longer provide.
        df = load(result_dir)
        self.assertEqual(str(df.a.dtype), 'int64')

        df = load(result_dir, _infer_types=False)
        self.assertEqual(str(df.a.dtype), 'object')

    def test_format(self):
        df = load(result_dir, b=0.3)
        self.assertTrue(df.empty)

        df = load(result_dir, b=0.3, _format={'b': '%.2f'})
        self.assertEqual(df.shape[0], 2)

    def test_stream(self):
        df = load(result_dir, _stream='stderr')
        self.assertTrue((df._stream == 'stderr').all())

    def test_process(self):
        # Select exactly one file so the assertion does not depend on the
        # order in which the directory listing returns the fixtures.
        df = load(result_dir, a=1, b=0.4, _format={'b': '%.2f'},
                  _process=self._sum_numbers)
        self.assertEqual(df.shape[0], 1)
        self.assertAlmostEqual(df.res.iloc[0], 1.4)
"a/src/optional/testresults/bar_stderr\ta\t5\tb\t0.70" "b/src/optional/testresults/bar_stderr\ta\t5\tb\t0.70" new file mode 100644 index 00000000..e69de29b diff --git "a/src/optional/testresults/bar_stderr\ta\t5\tb\t0.80" "b/src/optional/testresults/bar_stderr\ta\t5\tb\t0.80" new file mode 100644 index 00000000..e69de29b diff --git "a/src/optional/testresults/bar_stderr\ta\t6\tb\t0.70" "b/src/optional/testresults/bar_stderr\ta\t6\tb\t0.70" new file mode 100644 index 00000000..e69de29b diff --git "a/src/optional/testresults/bar_stderr\ta\t6\tb\t0.80" "b/src/optional/testresults/bar_stderr\ta\t6\tb\t0.80" new file mode 100644 index 00000000..e69de29b diff --git "a/src/optional/testresults/bar_stdout\ta\t5\tb\t0.70" "b/src/optional/testresults/bar_stdout\ta\t5\tb\t0.70" new file mode 100644 index 00000000..c33bb220 --- /dev/null +++ "b/src/optional/testresults/bar_stdout\ta\t5\tb\t0.70" @@ -0,0 +1 @@ +5 0.70 diff --git "a/src/optional/testresults/bar_stdout\ta\t5\tb\t0.80" "b/src/optional/testresults/bar_stdout\ta\t5\tb\t0.80" new file mode 100644 index 00000000..2b615136 --- /dev/null +++ "b/src/optional/testresults/bar_stdout\ta\t5\tb\t0.80" @@ -0,0 +1 @@ +5 0.80 diff --git "a/src/optional/testresults/bar_stdout\ta\t6\tb\t0.70" "b/src/optional/testresults/bar_stdout\ta\t6\tb\t0.70" new file mode 100644 index 00000000..486ba0b0 --- /dev/null +++ "b/src/optional/testresults/bar_stdout\ta\t6\tb\t0.70" @@ -0,0 +1 @@ +6 0.70 diff --git "a/src/optional/testresults/bar_stdout\ta\t6\tb\t0.80" "b/src/optional/testresults/bar_stdout\ta\t6\tb\t0.80" new file mode 100644 index 00000000..482deb0f --- /dev/null +++ "b/src/optional/testresults/bar_stdout\ta\t6\tb\t0.80" @@ -0,0 +1 @@ +6 0.80 diff --git "a/src/optional/testresults/foo_stderr\ta\t1\tb\t0.30" "b/src/optional/testresults/foo_stderr\ta\t1\tb\t0.30" new file mode 100644 index 00000000..e69de29b diff --git "a/src/optional/testresults/foo_stderr\ta\t1\tb\t0.40" "b/src/optional/testresults/foo_stderr\ta\t1\tb\t0.40" new 
file mode 100644 index 00000000..e69de29b diff --git "a/src/optional/testresults/foo_stderr\ta\t2\tb\t0.30" "b/src/optional/testresults/foo_stderr\ta\t2\tb\t0.30" new file mode 100644 index 00000000..e69de29b diff --git "a/src/optional/testresults/foo_stderr\ta\t2\tb\t0.40" "b/src/optional/testresults/foo_stderr\ta\t2\tb\t0.40" new file mode 100644 index 00000000..e69de29b diff --git "a/src/optional/testresults/foo_stdout\ta\t1\tb\t0.30" "b/src/optional/testresults/foo_stdout\ta\t1\tb\t0.30" new file mode 100644 index 00000000..a2278a06 --- /dev/null +++ "b/src/optional/testresults/foo_stdout\ta\t1\tb\t0.30" @@ -0,0 +1 @@ +1 0.30 diff --git "a/src/optional/testresults/foo_stdout\ta\t1\tb\t0.40" "b/src/optional/testresults/foo_stdout\ta\t1\tb\t0.40" new file mode 100644 index 00000000..42afa704 --- /dev/null +++ "b/src/optional/testresults/foo_stdout\ta\t1\tb\t0.40" @@ -0,0 +1 @@ +1 0.40 diff --git "a/src/optional/testresults/foo_stdout\ta\t2\tb\t0.30" "b/src/optional/testresults/foo_stdout\ta\t2\tb\t0.30" new file mode 100644 index 00000000..c82ce586 --- /dev/null +++ "b/src/optional/testresults/foo_stdout\ta\t2\tb\t0.30" @@ -0,0 +1 @@ +2 0.30 diff --git "a/src/optional/testresults/foo_stdout\ta\t2\tb\t0.40" "b/src/optional/testresults/foo_stdout\ta\t2\tb\t0.40" new file mode 100644 index 00000000..b7964383 --- /dev/null +++ "b/src/optional/testresults/foo_stdout\ta\t2\tb\t0.40" @@ -0,0 +1 @@ +2 0.40