From 701445aac649c3dc425f1ec579b1ba1f42ce8d5f Mon Sep 17 00:00:00 2001 From: Drew Frank Date: Fri, 21 Dec 2012 20:01:23 -0800 Subject: [PATCH] optional: Python loader works with new --results format. --- src/optional/genresults.sh | 3 +- src/optional/python/README | 15 +++---- src/optional/python/gnuparallel/_loader.py | 42 +++++++------------ src/optional/python/tests/test_loader.py | 16 +------ .../testresults/bar/a/5/b/0.70/stderr | 0 .../testresults/bar/a/5/b/0.70/stdout | 1 - .../testresults/bar/a/5/b/0.80/stderr | 0 .../testresults/bar/a/5/b/0.80/stdout | 1 - .../testresults/bar/a/6/b/0.70/stderr | 0 .../testresults/bar/a/6/b/0.70/stdout | 1 - .../testresults/bar/a/6/b/0.80/stderr | 0 .../testresults/bar/a/6/b/0.80/stdout | 1 - .../testresults/foo/a/1/b/0.30/stderr | 0 .../testresults/foo/a/1/b/0.30/stdout | 1 - .../testresults/foo/a/1/b/0.40/stderr | 0 .../testresults/foo/a/1/b/0.40/stdout | 1 - .../testresults/foo/a/2/b/0.30/stderr | 0 .../testresults/foo/a/2/b/0.30/stdout | 1 - .../testresults/foo/a/2/b/0.40/stderr | 0 .../testresults/foo/a/2/b/0.40/stdout | 1 - 20 files changed, 25 insertions(+), 59 deletions(-) delete mode 100644 src/optional/testresults/bar/a/5/b/0.70/stderr delete mode 100644 src/optional/testresults/bar/a/5/b/0.70/stdout delete mode 100644 src/optional/testresults/bar/a/5/b/0.80/stderr delete mode 100644 src/optional/testresults/bar/a/5/b/0.80/stdout delete mode 100644 src/optional/testresults/bar/a/6/b/0.70/stderr delete mode 100644 src/optional/testresults/bar/a/6/b/0.70/stdout delete mode 100644 src/optional/testresults/bar/a/6/b/0.80/stderr delete mode 100644 src/optional/testresults/bar/a/6/b/0.80/stdout delete mode 100644 src/optional/testresults/foo/a/1/b/0.30/stderr delete mode 100644 src/optional/testresults/foo/a/1/b/0.30/stdout delete mode 100644 src/optional/testresults/foo/a/1/b/0.40/stderr delete mode 100644 src/optional/testresults/foo/a/1/b/0.40/stdout delete mode 100644 src/optional/testresults/foo/a/2/b/0.30/stderr delete mode 100644 src/optional/testresults/foo/a/2/b/0.30/stdout delete mode 100644 src/optional/testresults/foo/a/2/b/0.40/stderr delete mode 100644 src/optional/testresults/foo/a/2/b/0.40/stdout diff --git a/src/optional/genresults.sh b/src/optional/genresults.sh index 25b0ccad..286fc53c 100755 --- a/src/optional/genresults.sh +++ b/src/optional/genresults.sh @@ -2,5 +2,4 @@ # # Generate the result files used to test the query modules. -../parallel --header : --result testresults/foo echo {a} {b} ::: a 1 2 ::: b 0.30 0.40 -../parallel --header : --result testresults/bar echo {a} {b} ::: a 5 6 ::: b 0.70 0.80 +../parallel --header : --result testresults echo {a} {b} ::: a 1 2 ::: b 0.30 0.40 diff --git a/src/optional/python/README b/src/optional/python/README index 44433e5b..a1af49f3 100644 --- a/src/optional/python/README +++ b/src/optional/python/README @@ -14,7 +14,7 @@ Sample usage: 1. Generate some results files by running parallel from the command line: # mkdir out - # parallel --header : --results out/pfx echo {arg1} {arg2} ::: arg1 1 2 ::: arg2 three four + # parallel --header : --results out echo {arg1} {arg2} ::: arg1 1 2 ::: arg2 three four 2. Load the results using the gnuparallel Python package: @@ -24,16 +24,13 @@ Sample usage: Type "help", "copyright", "credits" or "license" for more information. >>> import gnuparallel >>> help(gnuparallel.load) -!!! THIS PART IS BROKEN !!! >>> my_df = gnuparallel.load('out') >>> my_df - _prefix _stream arg1 arg2 resfile - 0 pfx stdout 1 three out/pfxstdout arg1 1 arg2 three - 1 pfx stdout 1 four out/pfxstdout arg1 1 arg2 four - 2 pfx stdout 2 three out/pfxstdout arg1 2 arg2 three - 3 pfx stdout 2 four out/pfxstdout arg1 2 arg2 four - >>> my_df.tail(1) - 3 pfx stdout 2 four out/pfxstdout arg1 2 arg2 four + _stream arg1 arg2 resfile + 0 stdout 2 three out/arg1/2/arg2/three/stdout + 1 stdout 2 four out/arg1/2/arg2/four/stdout + 2 stdout 1 three out/arg1/1/arg2/three/stdout + 3 stdout 1 four out/arg1/1/arg2/four/stdout See documentation for the pandas project (http://pandas.pydata.org/) for instructions on how to access and manipulate the loaded results. diff --git a/src/optional/python/gnuparallel/_loader.py b/src/optional/python/gnuparallel/_loader.py index afee6a64..ca2c0d18 100755 --- a/src/optional/python/gnuparallel/_loader.py +++ b/src/optional/python/gnuparallel/_loader.py @@ -8,7 +8,7 @@ import pandas as pd import os def load(_dir, _process=None, _format=None, _stream='stdout', - _prefix=None, _infer_types=True, **options): + _infer_types=True, **options): """Load files generated with parallel's --result option. One use of GNU parallel is to call one command many times, each @@ -43,14 +43,6 @@ def load(_dir, _process=None, _format=None, _stream='stdout', _stream : str, optional Specify either "stdout" or "stderr" to load results files from the corresponding stream. Default is "stdout". - _prefix : str, optional - Only load result files with a specific prefix. When using the --result - option to parallel it is possible to specify a prefix for all of the - result files. For example, - parallel --result /some/dir/a_prefix ... - would place all result files into the `/some/dir` directory and all of - the file names would begin with "a_prefix". This parameter lets you - filter based on this prefix. If None, allow any prefix. Default None. _infer_types : bool, optional Infer data types for option values. All option values are techinically strings (since they were passed on the command line). When _infer_types @@ -88,14 +80,15 @@ def load(_dir, _process=None, _format=None, _stream='stdout', for k,v in options.iteritems(): options[k] = set(_stringify(x, _format.get(k, '')) for x in v) options['_stream'] = [_stream] - if _prefix: - options['_prefix'] = [_prefix] # Iterate over results files and collect the matches. matches = [] - for file in os.listdir(_dir): - metadata = _parse_name(file) - metadata['resfile'] = os.path.join(_dir, metadata['resfile']) + normdir = os.path.normpath(_dir) + for path, file in _find_results(normdir): + # Don't include the root path as part of the metadata string. + metadata = _parse_path(path[len(normdir):]) + metadata['_stream'] = file + metadata['resfile'] = os.path.join(path, file) if _select(metadata, options): matches.append(metadata) @@ -117,19 +110,16 @@ def load(_dir, _process=None, _format=None, _stream='stdout', return df -def _parse_name(file, sep='\t'): - """Return a dict containing metadata extracted from the file name.""" - tokens = file.split(sep) - prefix_stream = tokens[0] - metadata = {k:v for k,v in zip(tokens[1::2], tokens[2::2])} +def _find_results(root): + """Find all regular files in a directory.""" + for (path, dirs, files) in os.walk(root): + for file in files: + yield (path, file) - stream_index = prefix_stream.find('stdout') - if stream_index == -1: - stream_index = prefix_stream.find('stderr') - prefix, stream = prefix_stream[:stream_index], prefix_stream[stream_index:] - - metadata.update({'_prefix': prefix, '_stream': stream, 'resfile': file}) - return metadata +def _parse_path(path): + """Return a dict containing metadata extracted from a file's path.""" + tokens = path.split(os.path.sep) + return {k:v for k,v in zip(tokens[1::2], tokens[2::2])} def _select(metadata, filter): """Return true if the metadata entry matches the filter, False otherwise.""" diff --git a/src/optional/python/tests/test_loader.py b/src/optional/python/tests/test_loader.py index ae480be4..ab06be76 100644 --- a/src/optional/python/tests/test_loader.py +++ b/src/optional/python/tests/test_loader.py @@ -9,20 +9,8 @@ class TestLoader(unittest.TestCase): def test_basics(self): df = load(result_dir) - self.assertEqual(set(df.columns), set(['a', 'b', '_prefix', 'resfile', '_stream'])) - self.assertEqual(df.shape[0], 8) - - def test_prefix(self): - df = load(result_dir, _prefix='foo_') + self.assertEqual(set(df.columns), set(['a', 'b', 'resfile', '_stream'])) self.assertEqual(df.shape[0], 4) - self.assertEqual(df.a.sum(), 6) - - df = load(result_dir, _prefix='bar_') - self.assertEqual(df.shape[0], 4) - self.assertEqual(df.a.sum(), 22) - - df = load(result_dir, _prefix='BAD') - self.assertTrue(df.empty) def test_filters(self): df = load(result_dir, a=2) @@ -60,7 +48,7 @@ class TestLoader(unittest.TestCase): def test_process(self): df = load(result_dir, a=1, _process=lambda x: pd.np.loadtxt(x).sum()) - self.assertAlmostEqual(df.res[0], 1.4) + self.assertAlmostEqual(df.sum()['res'], 2.7) if __name__ == '__main__': unittest.main() diff --git a/src/optional/testresults/bar/a/5/b/0.70/stderr b/src/optional/testresults/bar/a/5/b/0.70/stderr deleted file mode 100644 index e69de29b..00000000 diff --git a/src/optional/testresults/bar/a/5/b/0.70/stdout b/src/optional/testresults/bar/a/5/b/0.70/stdout deleted file mode 100644 index c33bb220..00000000 --- a/src/optional/testresults/bar/a/5/b/0.70/stdout +++ /dev/null @@ -1 +0,0 @@ -5 0.70 diff --git a/src/optional/testresults/bar/a/5/b/0.80/stderr b/src/optional/testresults/bar/a/5/b/0.80/stderr deleted file mode 100644 index e69de29b..00000000 diff --git a/src/optional/testresults/bar/a/5/b/0.80/stdout b/src/optional/testresults/bar/a/5/b/0.80/stdout deleted file mode 100644 index 2b615136..00000000 --- a/src/optional/testresults/bar/a/5/b/0.80/stdout +++ /dev/null @@ -1 +0,0 @@ -5 0.80 diff --git a/src/optional/testresults/bar/a/6/b/0.70/stderr b/src/optional/testresults/bar/a/6/b/0.70/stderr deleted file mode 100644 index e69de29b..00000000 diff --git a/src/optional/testresults/bar/a/6/b/0.70/stdout b/src/optional/testresults/bar/a/6/b/0.70/stdout deleted file mode 100644 index 486ba0b0..00000000 --- a/src/optional/testresults/bar/a/6/b/0.70/stdout +++ /dev/null @@ -1 +0,0 @@ -6 0.70 diff --git a/src/optional/testresults/bar/a/6/b/0.80/stderr b/src/optional/testresults/bar/a/6/b/0.80/stderr deleted file mode 100644 index e69de29b..00000000 diff --git a/src/optional/testresults/bar/a/6/b/0.80/stdout b/src/optional/testresults/bar/a/6/b/0.80/stdout deleted file mode 100644 index 482deb0f..00000000 --- a/src/optional/testresults/bar/a/6/b/0.80/stdout +++ /dev/null @@ -1 +0,0 @@ -6 0.80 diff --git a/src/optional/testresults/foo/a/1/b/0.30/stderr b/src/optional/testresults/foo/a/1/b/0.30/stderr deleted file mode 100644 index e69de29b..00000000 diff --git a/src/optional/testresults/foo/a/1/b/0.30/stdout b/src/optional/testresults/foo/a/1/b/0.30/stdout deleted file mode 100644 index a2278a06..00000000 --- a/src/optional/testresults/foo/a/1/b/0.30/stdout +++ /dev/null @@ -1 +0,0 @@ -1 0.30 diff --git a/src/optional/testresults/foo/a/1/b/0.40/stderr b/src/optional/testresults/foo/a/1/b/0.40/stderr deleted file mode 100644 index e69de29b..00000000 diff --git a/src/optional/testresults/foo/a/1/b/0.40/stdout b/src/optional/testresults/foo/a/1/b/0.40/stdout deleted file mode 100644 index 42afa704..00000000 --- a/src/optional/testresults/foo/a/1/b/0.40/stdout +++ /dev/null @@ -1 +0,0 @@ -1 0.40 diff --git a/src/optional/testresults/foo/a/2/b/0.30/stderr b/src/optional/testresults/foo/a/2/b/0.30/stderr deleted file mode 100644 index e69de29b..00000000 diff --git a/src/optional/testresults/foo/a/2/b/0.30/stdout b/src/optional/testresults/foo/a/2/b/0.30/stdout deleted file mode 100644 index c82ce586..00000000 --- a/src/optional/testresults/foo/a/2/b/0.30/stdout +++ /dev/null @@ -1 +0,0 @@ -2 0.30 diff --git a/src/optional/testresults/foo/a/2/b/0.40/stderr b/src/optional/testresults/foo/a/2/b/0.40/stderr deleted file mode 100644 index e69de29b..00000000 diff --git a/src/optional/testresults/foo/a/2/b/0.40/stdout b/src/optional/testresults/foo/a/2/b/0.40/stdout deleted file mode 100644 index b7964383..00000000 --- a/src/optional/testresults/foo/a/2/b/0.40/stdout +++ /dev/null @@ -1 +0,0 @@ -2 0.40