mirror of https://git.savannah.gnu.org/git/parallel.git

optional: Python loader works with new --results format.

parent 05a08c55b6
commit 701445aac6
@@ -2,5 +2,4 @@
 #
 # Generate the result files used to test the query modules.
-../parallel --header : --result testresults/foo echo {a} {b} ::: a 1 2 ::: b 0.30 0.40
-../parallel --header : --result testresults/bar echo {a} {b} ::: a 5 6 ::: b 0.70 0.80
+../parallel --header : --result testresults echo {a} {b} ::: a 1 2 ::: b 0.30 0.40
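(Not part of the diff: with the new --result DIR format each header/value pair becomes one
directory level, so the single command above should leave a tree along these lines, following
the out/arg1/2/arg2/three/stdout pattern shown in the README hunk below:

    testresults/a/1/b/0.30/stdout    testresults/a/2/b/0.30/stdout
    testresults/a/1/b/0.40/stdout    testresults/a/2/b/0.40/stdout

plus the matching stderr files.)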
@@ -14,7 +14,7 @@ Sample usage:
 1. Generate some results files by running parallel from the command line:
 
     # mkdir out
-    # parallel --header : --results out/pfx echo {arg1} {arg2} ::: arg1 1 2 ::: arg2 three four
+    # parallel --header : --results out echo {arg1} {arg2} ::: arg1 1 2 ::: arg2 three four
 
 2. Load the results using the gnuparallel Python package:
@@ -24,16 +24,13 @@ Sample usage:
 Type "help", "copyright", "credits" or "license" for more information.
 >>> import gnuparallel
 >>> help(gnuparallel.load)
 >>> my_df = gnuparallel.load('out')
 >>> my_df
-  _prefix _stream arg1   arg2                          resfile
-0     pfx  stdout    1  three  out/pfxstdout arg1 1 arg2 three
-1     pfx  stdout    1   four   out/pfxstdout arg1 1 arg2 four
-2     pfx  stdout    2  three  out/pfxstdout arg1 2 arg2 three
-3     pfx  stdout    2   four   out/pfxstdout arg1 2 arg2 four
->>> my_df.tail(1)
-3     pfx  stdout    2   four   out/pfxstdout arg1 2 arg2 four
+  _stream arg1   arg2                       resfile
+0  stdout    2  three  out/arg1/2/arg2/three/stdout
+1  stdout    2   four   out/arg1/2/arg2/four/stdout
+2  stdout    1  three  out/arg1/1/arg2/three/stdout
+3  stdout    1   four   out/arg1/1/arg2/four/stdout
 
 See documentation for the pandas project (http://pandas.pydata.org/) for
 instructions on how to access and manipulate the loaded results.
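(A few more illustrative calls in the same session, extrapolated from the load() docstring
and the tests further down; the exact column names and values are assumptions taken from the
example above:

    >>> gnuparallel.load('out', arg1=2)                  # keep only rows where arg1 == 2
    >>> gnuparallel.load('out', _stream='stderr')        # load the stderr files instead
    >>> gnuparallel.load('out', _process=lambda f: open(f).read())  # adds a 'res' column
)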
@@ -8,7 +8,7 @@ import pandas as pd
 import os
 
 def load(_dir, _process=None, _format=None, _stream='stdout',
-         _prefix=None, _infer_types=True, **options):
+         _infer_types=True, **options):
     """Load files generated with parallel's --result option.
 
     One use of GNU parallel is to call one command many times, each
@@ -43,14 +43,6 @@ def load(_dir, _process=None, _format=None, _stream='stdout',
     _stream : str, optional
         Specify either "stdout" or "stderr" to load results files from the
         corresponding stream. Default is "stdout".
-    _prefix : str, optional
-        Only load result files with a specific prefix. When using the --result
-        option to parallel it is possible to specify a prefix for all of the
-        result files. For example,
-            parallel --result /some/dir/a_prefix ...
-        would place all result files into the `/some/dir` directory and all of
-        the file names would begin with "a_prefix". This parameter lets you
-        filter based on this prefix. If None, allow any prefix. Default None.
     _infer_types : bool, optional
         Infer data types for option values. All option values are technically
         strings (since they were passed on the command line). When _infer_types
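(To make the type inference concrete, an illustration that is not part of the diff; 'arg1'
is the column from the README example:

    >>> load('out', _infer_types=True).arg1.dtype    # numeric dtype, e.g. int64
    >>> load('out', _infer_types=False).arg1.dtype   # object, i.e. plain strings
)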
@@ -88,14 +80,15 @@ def load(_dir, _process=None, _format=None, _stream='stdout',
     for k,v in options.iteritems():
         options[k] = set(_stringify(x, _format.get(k, '')) for x in v)
     options['_stream'] = [_stream]
-    if _prefix:
-        options['_prefix'] = [_prefix]
 
     # Iterate over results files and collect the matches.
     matches = []
-    for file in os.listdir(_dir):
-        metadata = _parse_name(file)
-        metadata['resfile'] = os.path.join(_dir, metadata['resfile'])
+    normdir = os.path.normpath(_dir)
+    for path, file in _find_results(normdir):
+        # Don't include the root path as part of the metadata string.
+        metadata = _parse_path(path[len(normdir):])
+        metadata['_stream'] = file
+        metadata['resfile'] = os.path.join(path, file)
         if _select(metadata, options):
             matches.append(metadata)
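(Pulled out of the module, the new discovery-and-parse flow amounts to this small, runnable
sketch; the 'out' directory and its layout are assumptions taken from the README example:

    import os

    root = os.path.normpath('out')
    for path, dirs, files in os.walk(root):     # what _find_results iterates over
        for name in files:
            rel = path[len(root):]              # e.g. '/arg1/2/arg2/three'
            tokens = rel.split(os.path.sep)     # ['', 'arg1', '2', 'arg2', 'three']
            metadata = dict(zip(tokens[1::2], tokens[2::2]))
            metadata['_stream'] = name          # 'stdout' or 'stderr'
            metadata['resfile'] = os.path.join(path, name)
            print(metadata)                     # {'arg1': '2', 'arg2': 'three', ...}
)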
@@ -117,19 +110,16 @@ def load(_dir, _process=None, _format=None, _stream='stdout',
 
     return df
 
-def _parse_name(file, sep='\t'):
-    """Return a dict containing metadata extracted from the file name."""
-    tokens = file.split(sep)
-    prefix_stream = tokens[0]
-    metadata = {k:v for k,v in zip(tokens[1::2], tokens[2::2])}
-
-    stream_index = prefix_stream.find('stdout')
-    if stream_index == -1:
-        stream_index = prefix_stream.find('stderr')
-    prefix, stream = prefix_stream[:stream_index], prefix_stream[stream_index:]
-
-    metadata.update({'_prefix': prefix, '_stream': stream, 'resfile': file})
-    return metadata
+def _find_results(root):
+    """Find all regular files in a directory."""
+    for (path, dirs, files) in os.walk(root):
+        for file in files:
+            yield (path, file)
+
+def _parse_path(path):
+    """Return a dict containing metadata extracted from a file's path."""
+    tokens = path.split(os.path.sep)
+    return {k:v for k,v in zip(tokens[1::2], tokens[2::2])}
 
 def _select(metadata, filter):
     """Return true if the metadata entry matches the filter, False otherwise."""
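(As a worked example of the stride-2 slicing in _parse_path; the value is taken from the
README example, the leading separator comes from stripping the root, and '/' stands in for
os.path.sep:

    >>> tokens = '/arg1/2/arg2/three'.split('/')   # ['', 'arg1', '2', 'arg2', 'three']
    >>> dict(zip(tokens[1::2], tokens[2::2]))
    {'arg1': '2', 'arg2': 'three'}
)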
@@ -9,20 +9,8 @@ class TestLoader(unittest.TestCase):
 
     def test_basics(self):
         df = load(result_dir)
-        self.assertEqual(set(df.columns), set(['a', 'b', '_prefix', 'resfile', '_stream']))
-        self.assertEqual(df.shape[0], 8)
-
-    def test_prefix(self):
-        df = load(result_dir, _prefix='foo_')
-        self.assertEqual(set(df.columns), set(['a', 'b', 'resfile', '_stream']))
-        self.assertEqual(df.shape[0], 4)
-        self.assertEqual(df.a.sum(), 6)
-
-        df = load(result_dir, _prefix='bar_')
-        self.assertEqual(df.shape[0], 4)
-        self.assertEqual(df.a.sum(), 22)
-
-        df = load(result_dir, _prefix='BAD')
-        self.assertTrue(df.empty)
+        self.assertEqual(set(df.columns), set(['a', 'b', 'resfile', '_stream']))
+        self.assertEqual(df.shape[0], 4)
 
     def test_filters(self):
         df = load(result_dir, a=2)
@@ -60,7 +48,7 @@ class TestLoader(unittest.TestCase):
 
     def test_process(self):
         df = load(result_dir, a=1, _process=lambda x: pd.np.loadtxt(x).sum())
         self.assertAlmostEqual(df.res[0], 1.4)
         self.assertAlmostEqual(df.sum()['res'], 2.7)
 
 if __name__ == '__main__':
     unittest.main()
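(A side note on the _process example as it stands: pd.np was pandas' re-export of numpy,
deprecated in pandas 1.0 and later removed, so the equivalent callback today would import
numpy directly:

    import numpy as np
    _process=lambda x: np.loadtxt(x).sum()
)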
@@ -1 +0,0 @@
-5 0.70

@@ -1 +0,0 @@
-5 0.80

@@ -1 +0,0 @@
-6 0.70

@@ -1 +0,0 @@
-6 0.80

@@ -1 +0,0 @@
-1 0.30

@@ -1 +0,0 @@
-1 0.40

@@ -1 +0,0 @@
-2 0.30

@@ -1 +0,0 @@
-2 0.40