optional: Python loader works with new --results format.

Drew Frank 2012-12-21 20:01:23 -08:00
parent 05a08c55b6
commit 701445aac6
20 changed files with 25 additions and 59 deletions

View file

@@ -2,5 +2,4 @@
 #
 # Generate the result files used to test the query modules.
-../parallel --header : --result testresults/foo echo {a} {b} ::: a 1 2 ::: b 0.30 0.40
-../parallel --header : --result testresults/bar echo {a} {b} ::: a 5 6 ::: b 0.70 0.80
+../parallel --header : --result testresults echo {a} {b} ::: a 1 2 ::: b 0.30 0.40
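
A hedged sketch, assuming the nested layout implied by the loader changes below: the single --result testresults invocation is expected to produce one directory level per header/value pair, with the stream name as the leaf file (e.g. testresults/a/1/b/0.30/stdout). A minimal way to list that tree:

    import os

    # Walk the fixture directory and print every result file it contains.
    for path, dirs, files in os.walk('testresults'):
        for name in files:
            print(os.path.join(path, name))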

View file

@@ -14,7 +14,7 @@ Sample usage:
 1. Generate some results files by running parallel from the command line:
     # mkdir out
-    # parallel --header : --results out/pfx echo {arg1} {arg2} ::: arg1 1 2 ::: arg2 three four
+    # parallel --header : --results out echo {arg1} {arg2} ::: arg1 1 2 ::: arg2 three four
 2. Load the results using the gnuparallel Python package:
@@ -24,16 +24,13 @@ Sample usage:
     Type "help", "copyright", "credits" or "license" for more information.
     >>> import gnuparallel
     >>> help(gnuparallel.load)
     >>> my_df = gnuparallel.load('out')
     >>> my_df
-      _prefix _stream arg1   arg2                          resfile
-    0     pfx  stdout    1  three  out/pfxstdout arg1 1 arg2 three
-    1     pfx  stdout    1   four   out/pfxstdout arg1 1 arg2 four
-    2     pfx  stdout    2  three  out/pfxstdout arg1 2 arg2 three
-    3     pfx  stdout    2   four   out/pfxstdout arg1 2 arg2 four
-    >>> my_df.tail(1)
-    3     pfx  stdout    2   four   out/pfxstdout arg1 2 arg2 four
+      _stream arg1   arg2                       resfile
+    0  stdout    2  three  out/arg1/2/arg2/three/stdout
+    1  stdout    2   four   out/arg1/2/arg2/four/stdout
+    2  stdout    1  three  out/arg1/1/arg2/three/stdout
+    3  stdout    1   four   out/arg1/1/arg2/four/stdout
 See documentation for the pandas project (http://pandas.pydata.org/) for
 instructions on how to access and manipulate the loaded results.
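
A hedged follow-up sketch of slicing the loaded frame with pandas, assuming the column names from the sample session above (arg2 values are non-numeric, so they stay strings and compare against 'three' directly):

    import gnuparallel

    my_df = gnuparallel.load('out')

    # Keep only the arg2 == 'three' rows and print each matching stdout file.
    for resfile in my_df[my_df.arg2 == 'three'].resfile:
        with open(resfile) as f:
            print(f.read())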

View file

@@ -8,7 +8,7 @@ import pandas as pd
 import os
 
 def load(_dir, _process=None, _format=None, _stream='stdout',
-         _prefix=None, _infer_types=True, **options):
+         _infer_types=True, **options):
     """Load files generated with parallel's --result option.
 
     One use of GNU parallel is to call one command many times, each
@@ -43,14 +43,6 @@ def load(_dir, _process=None, _format=None, _stream='stdout',
     _stream : str, optional
         Specify either "stdout" or "stderr" to load results files from the
        corresponding stream. Default is "stdout".
-    _prefix : str, optional
-        Only load result files with a specific prefix. When using the --result
-        option to parallel it is possible to specify a prefix for all of the
-        result files. For example,
-            parallel --result /some/dir/a_prefix ...
-        would place all result files into the `/some/dir` directory and all of
-        the file names would begin with "a_prefix". This parameter lets you
-        filter based on this prefix. If None, allow any prefix. Default None.
     _infer_types : bool, optional
         Infer data types for option values. All option values are technically
         strings (since they were passed on the command line). When _infer_types
@@ -88,14 +80,15 @@ def load(_dir, _process=None, _format=None, _stream='stdout',
     for k,v in options.iteritems():
         options[k] = set(_stringify(x, _format.get(k, '')) for x in v)
     options['_stream'] = [_stream]
-    if _prefix:
-        options['_prefix'] = [_prefix]
 
     # Iterate over results files and collect the matches.
     matches = []
-    for file in os.listdir(_dir):
-        metadata = _parse_name(file)
-        metadata['resfile'] = os.path.join(_dir, metadata['resfile'])
+    normdir = os.path.normpath(_dir)
+    for path, file in _find_results(normdir):
+        # Don't include the root path as part of the metadata string.
+        metadata = _parse_path(path[len(normdir):])
+        metadata['_stream'] = file
+        metadata['resfile'] = os.path.join(path, file)
         if _select(metadata, options):
             matches.append(metadata)
@@ -117,19 +110,16 @@ def load(_dir, _process=None, _format=None, _stream='stdout',
     return df
 
-def _parse_name(file, sep='\t'):
-    """Return a dict containing metadata extracted from the file name."""
-    tokens = file.split(sep)
-    prefix_stream = tokens[0]
-    metadata = {k:v for k,v in zip(tokens[1::2], tokens[2::2])}
+def _find_results(root):
+    """Find all regular files in a directory."""
+    for (path, dirs, files) in os.walk(root):
+        for file in files:
+            yield (path, file)
 
-    stream_index = prefix_stream.find('stdout')
-    if stream_index == -1:
-        stream_index = prefix_stream.find('stderr')
-    prefix, stream = prefix_stream[:stream_index], prefix_stream[stream_index:]
-    metadata.update({'_prefix': prefix, '_stream': stream, 'resfile': file})
-    return metadata
+def _parse_path(path):
+    """Return a dict containing metadata extracted from a file's path."""
+    tokens = path.split(os.path.sep)
+    return {k:v for k,v in zip(tokens[1::2], tokens[2::2])}
 
 def _select(metadata, filter):
     """Return true if the metadata entry matches the filter, False otherwise."""

View file

@@ -9,20 +9,8 @@ class TestLoader(unittest.TestCase):
     def test_basics(self):
         df = load(result_dir)
-        self.assertEqual(set(df.columns), set(['a', 'b', '_prefix', 'resfile', '_stream']))
-        self.assertEqual(df.shape[0], 8)
-
-    def test_prefix(self):
-        df = load(result_dir, _prefix='foo_')
         self.assertEqual(set(df.columns), set(['a', 'b', 'resfile', '_stream']))
         self.assertEqual(df.shape[0], 4)
-        self.assertEqual(df.a.sum(), 6)
-        df = load(result_dir, _prefix='bar_')
-        self.assertEqual(df.shape[0], 4)
-        self.assertEqual(df.a.sum(), 22)
-        df = load(result_dir, _prefix='BAD')
-        self.assertTrue(df.empty)
 
     def test_filters(self):
         df = load(result_dir, a=2)
@@ -60,7 +48,7 @@ class TestLoader(unittest.TestCase):
     def test_process(self):
         df = load(result_dir, a=1, _process=lambda x: pd.np.loadtxt(x).sum())
-        self.assertAlmostEqual(df.res[0], 1.4)
+        self.assertAlmostEqual(df.sum()['res'], 2.7)
 
 if __name__ == '__main__':
     unittest.main()
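
A hedged sanity check of the updated test_process expectation: with a=1 the loader now matches the two regenerated fixture files containing "1 0.30" and "1 0.40", numpy.loadtxt(...).sum() yields 1.3 and 1.4, and the order-independent frame-wide sum is 2.7 (the old assert on df.res[0] depended on file ordering):

    import numpy as np
    from io import StringIO

    sums = [np.loadtxt(StringIO(line)).sum() for line in ('1 0.30', '1 0.40')]
    assert abs(sum(sums) - 2.7) < 1e-9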

View file

@@ -1 +0,0 @@
-5 0.70

View file

@@ -1 +0,0 @@
-5 0.80

View file

@@ -1 +0,0 @@
-6 0.70

View file

@@ -1 +0,0 @@
-6 0.80

View file

@@ -1 +0,0 @@
-1 0.30

View file

@@ -1 +0,0 @@
-1 0.40

View file

@@ -1 +0,0 @@
-2 0.30

View file

@@ -1 +0,0 @@
-2 0.40