Merge branch 'master' of ssh://git.sv.gnu.org/srv/git/parallel

Conflicts: src/parallel testsuite/tests-to-run/parallel-local2.sh
2024-12-26 06:37:56 +00:00 · 2012-10-17 02:16:01 +02:00 · 2012-10-17 02:16:01 +02:00 · d9675d0bd0
parent fcaf41b325 d2678e53b0
commit d9675d0bd0
26 changed files with 353 additions and 19 deletions
--- a/src/optional/genresults.sh
+++ b/src/optional/genresults.sh
@ -0,0 +1,6 @@
+#!/bin/bash
+#
+# Generate the result files used to test the query modules.
+#
+# Each invocation runs `echo {a} {b}` for the values supplied for the two
+# input sources, which --header : names `a` and `b`, and stores the captured
+# output in files under testresults/ whose names start with foo_ or bar_.
+#
+# NOTE(review): ../parallel is a relative path, so this presumably has to be
+# run from the directory that contains this script -- confirm.
+
+../parallel --header : --result testresults/foo_ echo {a} {b} ::: a 1 2 ::: b 0.30 0.40
+../parallel --header : --result testresults/bar_ echo {a} {b} ::: a 5 6 ::: b 0.70 0.80
--- a/src/optional/python/.gitignore
+++ b/src/optional/python/.gitignore
@ -0,0 +1,4 @@
+*.pyc
+build
+dist
+*.egg-info
--- a/src/optional/python/README
+++ b/src/optional/python/README
@ -0,0 +1,40 @@
+gnuparallel : Simple loading of GNU parallel result files.
+
+The gnuparallel package provides a single function, `load`, which
+loads results from files generated by GNU parallel into a Pandas
+DataFrame object. See `help(gnuparallel.load)` for details.
+
+Installation:
+
+    # python setup.py install
+
+Sample usage:
+
+1. Generate some results files by running parallel from the command line:
+
+    # mkdir outdir
+    # parallel --results outdir/ echo {arg1} {arg2} ::: arg1 1 2 ::: arg2 three four
+
+2. Load the results using the gnuparallel Python package:
+
+    # python
+    Python 2.7.3 (default, Apr 24 2012, 00:00:54) 
+    [GCC 4.7.0 20120414 (prerelease)] on linux2
+    Type "help", "copyright", "credits" or "license" for more information.
+    >>> import gnuparallel
+    >>> help(gnuparallel.load)
+    >>> myresults = gnuparallel.load('outdir')
+    >>> myresults
+          1      2  _prefix _stream                              resfile
+    0     2   four      NaN  stdout     outdir/stdout\t1\t2\t2\tfour
+    1     2  three      NaN  stdout     outdir/stdout\t1\t2\t2\tthree
+    2     2   arg2      NaN  stdout     outdir/stdout\t1\t2\t2\targ2
+    3     1   four      NaN  stdout     outdir/stdout\t1\t1\t2\tfour
+    4     1  three      NaN  stdout     outdir/stdout\t1\t1\t2\tthree
+    5     1   arg2      NaN  stdout     outdir/stdout\t1\t1\t2\targ2
+    6  arg1   four      NaN  stdout     outdir/stdout\t1\targ1\t2\tfour
+    7  arg1  three      NaN  stdout     outdir/stdout\t1\targ1\t2\tthree
+    8  arg1   arg2      NaN  stdout     outdir/stdout\t1\targ1\t2\targ2
+
+See documentation for the pandas project (http://pandas.pydata.org/) for
+instructions on how to access and manipulate the loaded results.
--- a/src/optional/python/gnuparallel/init.py
+++ b/src/optional/python/gnuparallel/init.py
@ -0,0 +1,3 @@
+"""Load GNU Parallel --results files into a Pandas DataFrame."""
+
+# Explicit relative import: the implicit form (`from _loader import ...`)
+# only resolves on Python 2. `load` is the only name listed in
+# _loader.__all__, so this exposes the same public API as the wildcard did.
+from ._loader import load
--- a/src/optional/python/gnuparallel/_loader.py
+++ b/src/optional/python/gnuparallel/_loader.py
@ -0,0 +1,147 @@
+"""
+A function for loading the --result files generated by GNU Parallel.
+"""
+__all__ = ['load']
+
+import os
+try:
+    from cStringIO import StringIO  # Python 2
+except ImportError:
+    from io import StringIO  # Python 3: the cStringIO module no longer exists
+
+import pandas as pd
+
+def load(_dir, _process=None, _format=None, _stream='stdout',
+        _prefix=None, _infer_types=True, **options):
+    """Load files generated with parallel's --result option.
+
+    One use of GNU parallel is to call one command many times, each
+    time with a different set of arguments. With the --result option,
+    parallel will capture stdout and stderr from these processes and
+    store them in files named according to the arguments of each
+    individual call. This function provides easy loading of these
+    result files into a Pandas DataFrame.
+
+    Parameters
+    ----------
+    _dir : str
+        Directory containing the results files.
+    _process : function, optional
+        Function that opens a results file and returns an object containing
+        its results. If not provided, the resulting data frame will include
+        a column containing the file names, not the actual results.
+
+        If provided, the function should take a filename as its sole parameter.
+        Whatever the function returns will be stored in the "res" column of
+        the resulting DataFrame.
+    _format : dict, optional
+        Dictionary of format strings, used to convert any provided filter
+        values to a format matching the results file names.
+
+        For example, if the `foo` parameter to parallel was "0.10" and you pass
+        foo=0.10 as an option, you will not find the intended file because
+        str(0.10) == "0.1". To fix this, you should also include the key-value
+        pair "foo": "%.2f" in the _format dict. This is usually only necessary
+        for float-valued arguments where rounding or precision issues might
+        affect the matching process.
+    _stream : str, optional
+        Specify either "stdout" or "stderr" to load results files from the
+        corresponding stream. Default is "stdout".
+    _prefix : str, optional
+        Only load result files with a specific prefix. When using the --result
+        option to parallel it is possible to specify a prefix for all of the
+        result files. For example,
+            parallel --result /some/dir/a_prefix ...
+        would place all result files into the `/some/dir` directory and all of
+        the file names would begin with "a_prefix". This parameter lets you
+        filter based on this prefix. If None, allow any prefix. Default None.
+    _infer_types : bool, optional
+        Infer data types for option values. All option values are technically
+        strings (since they were passed on the command line). When _infer_types
+        is True, the resulting DataFrame will convert these values to inferred
+        dtypes, e.g. the number 1 instead of "1". Default True.
+    **options : kwargs
+        Additional keyword arguments that will be used to filter the subset
+        of results included in the output. The values can be either single
+        values or iterables. If they are iterable, files corresponding to any
+        of the included values will be considered a match.
+
+        For example, passing `foo=[1,2,3]` will include results from files
+        corresponding to runs where the parallel argument named `foo` had
+        the value "1", "2", or "3".
+
+        See also the _format parameter.
+
+    Returns
+    -------
+    res : pandas.DataFrame
+        A DataFrame with one row per matching results file, in the order
+        returned by os.listdir. It has one column named for each of the
+        parallel arguments, a "_prefix" and a "_stream" column, and,
+        depending on the _process argument, either:
+        - A "res" column containing the results corresponding to each run.
+        - A "resfile" column containing the names of the results files.
+    """
+    if _format is None:
+        _format = {}
+
+    # Normalise the filter options: wrap single values in a list, then render
+    # every value as the string that would appear in a results file name.
+    # A fresh dict is built instead of rewriting `options` during iteration.
+    filters = {}
+    for key, values in options.items():
+        # Strings count as single values even where they define __iter__.
+        if not hasattr(values, '__iter__') or isinstance(values, (str, bytes)):
+            values = [values]
+        filters[key] = set(_stringify(v, _format.get(key, '')) for v in values)
+    filters['_stream'] = [_stream]
+    if _prefix:
+        filters['_prefix'] = [_prefix]
+
+    # Iterate over results files and collect the matches.
+    matches = []
+    for name in os.listdir(_dir):
+        metadata = _parse_name(name)
+        metadata['resfile'] = os.path.join(_dir, metadata['resfile'])
+        if _select(metadata, filters):
+            matches.append(metadata)
+
+    # Create a DataFrame from the matches.
+    df = pd.DataFrame(matches)
+
+    # Optionally convert string argument values to numeric types by writing
+    # the frame out as CSV and letting the CSV reader infer the dtypes.
+    # An empty frame has nothing to infer, so the round-trip is skipped.
+    if _infer_types and not df.empty:
+        buf = StringIO()
+        df.to_csv(buf)
+        df = pd.read_csv(StringIO(buf.getvalue()), index_col=0)
+
+    # Open and process the results. This needs to happen after the type
+    # inference phase since the processed results can be arbitrary
+    # Python objects and might not survive the round-trip.
+    if _process and not df.empty:
+        df['res'] = df.resfile.apply(_process)
+        df = df.drop('resfile', axis=1)
+
+    return df
+
+def _parse_name(file, sep='\t'):
+    """Return a dict containing metadata extracted from the file name.
+
+    The name is split on `sep`. The first token is the user's prefix
+    immediately followed by the stream name ("stdout" or "stderr"); the
+    remaining tokens alternate between argument name and argument value.
+
+    The returned dict maps each argument name to its value (as a string)
+    and additionally holds the keys '_prefix', '_stream' and 'resfile'
+    (the unmodified file name). If the first token does not end in a
+    stream name, '_stream' is '' and '_prefix' is the whole first token.
+    """
+    tokens = file.split(sep)
+    prefix_stream = tokens[0]
+    metadata = dict(zip(tokens[1::2], tokens[2::2]))
+
+    # The stream name is the suffix of the first token. Matching on the
+    # suffix keeps a prefix that itself contains "stdout"/"stderr" intact and
+    # avoids slicing with the -1 that str.find returns when nothing matches.
+    for stream in ('stdout', 'stderr'):
+        if prefix_stream.endswith(stream):
+            prefix = prefix_stream[:-len(stream)]
+            break
+    else:
+        prefix, stream = prefix_stream, ''
+
+    metadata.update({'_prefix': prefix, '_stream': stream, 'resfile': file})
+    return metadata
+    
+def _select(metadata, filter):
+    """Return True if the metadata entry matches the filter, False otherwise.
+
+    `filter` maps a metadata key to a collection of accepted values. The
+    entry matches when every key in `filter` is present in `metadata` and
+    the entry's value for that key equals one of the accepted values. An
+    empty filter matches everything; an empty collection matches nothing.
+    """
+    return all(key in metadata and metadata[key] in accepted
+               for key, accepted in filter.items())
+
+def _stringify(x, fmt):
+    """Return the string representation of x, using a format string if provided.
+
+    When `fmt` is empty (or otherwise falsy) the result is str(x); otherwise
+    it is `fmt` applied to x with the % operator.
+    """
+    if not fmt:
+        return str(x)
+    if isinstance(x, tuple):
+        # The % operator would unpack a bare tuple into several format
+        # arguments; wrap it so that it is formatted as one value.
+        x = (x,)
+    return fmt % x
--- a/src/optional/python/setup.py
+++ b/src/optional/python/setup.py
@ -0,0 +1,15 @@
+#!/usr/bin/env python
+"""Install script for the gnuparallel package."""
+
+from distutils.core import setup
+
+# NOTE(review): install_requires is a setuptools keyword -- confirm that
+# distutils.core.setup acts on it; otherwise pandas has to be installed
+# separately.
+setup(
+    name='gnuparallel',
+    version='0.1',
+    description='Load GNU parallel result files.',
+    author='Drew Frank',
+    author_email='drewfrank@gmail.com',
+    packages=['gnuparallel'],
+    install_requires=['pandas'],
+)
--- a/src/optional/python/tests/test_loader.py
+++ b/src/optional/python/tests/test_loader.py
@ -0,0 +1,66 @@
+import pandas as pd
+import unittest
+
+from gnuparallel import load
+
+result_dir = '../../testresults'
+
+class TestLoader(unittest.TestCase):
+    """Exercise gnuparallel.load against the fixture files in result_dir."""
+
+    def test_basics(self):
+        df = load(result_dir)
+        self.assertEqual(set(df.columns),
+                         set(['a', 'b', '_prefix', 'resfile', '_stream']))
+        self.assertEqual(df.shape[0], 8)
+
+    def test_prefix(self):
+        df = load(result_dir, _prefix='foo_')
+        self.assertEqual(df.shape[0], 4)
+        self.assertEqual(df.a.sum(), 6)
+
+        df = load(result_dir, _prefix='bar_')
+        self.assertEqual(df.shape[0], 4)
+        self.assertEqual(df.a.sum(), 22)
+
+        df = load(result_dir, _prefix='BAD')
+        self.assertTrue(df.empty)
+
+    def test_filters(self):
+        # A single value and a one-element list must select the same rows.
+        for wanted in (2, [2]):
+            df = load(result_dir, a=wanted)
+            self.assertEqual(df.shape[0], 2)
+            self.assertEqual(df.a.sum(), 4)
+
+        df = load(result_dir, a=[1,2])
+        self.assertEqual(df.shape[0], 4)
+        self.assertEqual(df.a.sum(), 6)
+
+        df = load(result_dir, a=1000)
+        self.assertTrue(df.empty)
+
+    def test_infer_types(self):
+        # dtypes are compared against their generic spellings so the test
+        # does not rely on the `pd.np` alias.
+        df = load(result_dir)
+        self.assertEqual(df.a.dtype, 'int64')
+
+        df = load(result_dir, _infer_types=False)
+        self.assertEqual(df.a.dtype, object)
+
+    def test_format(self):
+        # str(0.3) is "0.3", which does not match the "0.30" in the file names.
+        df = load(result_dir, b=0.3)
+        self.assertTrue(df.empty)
+
+        df = load(result_dir, b=0.3, _format={'b': '%.2f'})
+        self.assertEqual(df.shape[0], 2)
+
+    def test_stream(self):
+        df = load(result_dir, _stream='stderr')
+        self.assertTrue((df._stream == 'stderr').all())
+
+    def test_process(self):
+        def total(path):
+            """Sum the whitespace-separated numbers stored in a results file."""
+            with open(path) as handle:
+                return sum(float(token) for token in handle.read().split())
+
+        df = load(result_dir, a=1, _process=total)
+        # Rows follow directory-listing order, so compare the sorted values
+        # rather than relying on which file happens to come first.
+        self.assertEqual(df.shape[0], 2)
+        for got, want in zip(sorted(df.res), [1.3, 1.4]):
+            self.assertAlmostEqual(got, want)
+
+if __name__ == '__main__':
+    unittest.main()
--- a/src/optional/testresults/bar_stderr
+++ b/src/optional/testresults/bar_stderr
--- a/src/optional/testresults/bar_stderr
+++ b/src/optional/testresults/bar_stderr
--- a/src/optional/testresults/bar_stderr
+++ b/src/optional/testresults/bar_stderr
--- a/src/optional/testresults/bar_stderr
+++ b/src/optional/testresults/bar_stderr
--- a/src/optional/testresults/bar_stdout
+++ b/src/optional/testresults/bar_stdout
@ -0,0 +1 @@
+5 0.70
--- a/src/optional/testresults/bar_stdout
+++ b/src/optional/testresults/bar_stdout
@ -0,0 +1 @@
+5 0.80
--- a/src/optional/testresults/bar_stdout
+++ b/src/optional/testresults/bar_stdout
@ -0,0 +1 @@
+6 0.70
--- a/src/optional/testresults/bar_stdout
+++ b/src/optional/testresults/bar_stdout
@ -0,0 +1 @@
+6 0.80
--- a/src/optional/testresults/foo_stderr
+++ b/src/optional/testresults/foo_stderr
--- a/src/optional/testresults/foo_stderr
+++ b/src/optional/testresults/foo_stderr
--- a/src/optional/testresults/foo_stderr
+++ b/src/optional/testresults/foo_stderr
--- a/src/optional/testresults/foo_stderr
+++ b/src/optional/testresults/foo_stderr
--- a/src/optional/testresults/foo_stdout
+++ b/src/optional/testresults/foo_stdout
@ -0,0 +1 @@
+1 0.30
--- a/src/optional/testresults/foo_stdout
+++ b/src/optional/testresults/foo_stdout
@ -0,0 +1 @@
+1 0.40
--- a/src/optional/testresults/foo_stdout
+++ b/src/optional/testresults/foo_stdout
@ -0,0 +1 @@
+2 0.30
--- a/src/optional/testresults/foo_stdout
+++ b/src/optional/testresults/foo_stdout
@ -0,0 +1 @@
+2 0.40
--- a/src/parallel.pod
+++ b/src/parallel.pod
@ -445,6 +445,15 @@ If I<eof-str> is omitted, there is no end of file string.  If neither
 B<-E> nor B<-e> is used, no end of file string is used.


+=item B<--env> I<var>
+
+Copy environment variable I<var>. This will copy I<var> to the
+environment that the command is run in. This is especially useful for
+remote environments.
+
+Caveat: If I<var> contains a newline ('\n') the value will be corrupted.
+
+
 =item B<--eta>

 Show the estimated number of seconds before finishing. This forces GNU
@ -979,9 +988,9 @@ Only used with B<--pipe>.
 Results in files named by tab separated arguments. Save the output
 into files. The file names will be prefixed with I<prefix> which can
 contain a path with a prefix string. The file with output from stdout
-(standard output) will prefixed with 'I<prefix>stdout_'.  The file
+(standard output) will be prefixed with 'I<prefix>stdout'.  The file
 with output from stderr (standard error) will prefixed with
-'I<prefix>stderr_'.
+'I<prefix>stderr'.

 The postfix is the header of the input source (if using B<--header :>)
 or the number of the input source followed by the value of the input
@ -994,14 +1003,14 @@ E.g:

 will generate the files:

-  foo/barstderr_a	I	b	III
-  foo/barstderr_a	I	b	IIII
-  foo/barstderr_a	II	b	III
-  foo/barstderr_a	II	b	IIII
-  foo/barstdout_a	I	b	III
-  foo/barstdout_a	I	b	IIII
-  foo/barstdout_a	II	b	III
-  foo/barstdout_a	II	b	IIII
+  foo/barstderr	a	I	b	III
+  foo/barstderr	a	I	b	IIII
+  foo/barstderr	a	II	b	III
+  foo/barstderr	a	II	b	IIII
+  foo/barstdout	a	I	b	III
+  foo/barstdout	a	I	b	IIII
+  foo/barstdout	a	II	b	III
+  foo/barstdout	a	II	b	IIII

 and

@ -1009,14 +1018,14 @@ and

 will generate the files:

-  foo/barstderr_1	I	2	III
-  foo/barstderr_1	I	2	IIII
-  foo/barstderr_1	II	2	III
-  foo/barstderr_1	II	2	IIII
-  foo/barstdout_1	I	2	III
-  foo/barstdout_1	I	2	IIII
-  foo/barstdout_1	II	2	III
-  foo/barstdout_1	II	2	IIII
+  foo/barstderr	1	I	2	III
+  foo/barstderr	1	I	2	IIII
+  foo/barstderr	1	II	2	III
+  foo/barstderr	1	II	2	IIII
+  foo/barstdout	1	I	2	III
+  foo/barstdout	1	I	2	IIII
+  foo/barstdout	1	II	2	III
+  foo/barstdout	1	II	2	IIII

 where all spaces are TABs (\t);.

@ -1850,6 +1859,16 @@ This also works if the input file is a file with columns:
  cat addressbook.tsv | parallel --colsep '\t' --header : echo {Name} {E-mail address}


+=head1 EXAMPLE: Count the differences between all files in a dir
+
+Using B<--results> the results are saved in /tmp/diffcount*.
+
+  parallel --results /tmp/diffcount "diff -U 0 {1} {2} |tail -n +3 |grep -v '^@'|wc -l" ::: * ::: *
+
+To see the difference between file A and file B look at the file
+'/tmp/diffcount 1 A 2 B' where spaces are TABs (\t).
+
+
 =head1 EXAMPLE: Speeding up fast jobs

 Starting a job on the local machine takes around 3 ms. This can be a
--- a/testsuite/tests-to-run/parallel-local2.sh
+++ b/testsuite/tests-to-run/parallel-local2.sh
@ -52,9 +52,26 @@ perl -e 'for(160) { printf "%c%c %c%d\0",$_,$_,$_,$_ }' | stdout parallel --nice

 echo '### Test too slow spawning'
 killall -9 burnP6 2>/dev/null
-seq 1 2 | parallel -j2 -N0 timeout -k 25 26 burnP6 &
+seq `parallel --number-of-cores` | parallel -j100% -N0 timeout -k 25 26 burnP6 &
 sleep 1
 seq 1 1000 |
 stdout nice nice  parallel -s 100 -uj0 true |
 perl -pe '/parallel: Warning: Starting \d+ processes took/ and do {close STDIN; `killall -9 burnP6`; print "OK\n"; exit }'
 killall -9 burnP6 2>/dev/null
+
+echo '### Test --env  - https://savannah.gnu.org/bugs/?37351'
+export TWOSPACES='  2  spaces  '
+export THREESPACES=" >  My brother's 12\" records  < "
+stdout parallel --env TWOSPACES echo 'a"$TWOSPACES"b' ::: 1
+stdout parallel --env TWOSPACES --env THREESPACES echo 'a"$TWOSPACES"b' 'a"$THREESPACES"b' ::: 2
+stdout parallel --env TWOSPACES,THREESPACES echo 'a"$TWOSPACES"b' 'a"$THREESPACES"b' ::: 2a
+stdout parallel -S localhost --env TWOSPACES echo 'a"$TWOSPACES"b' ::: 1
+stdout parallel -S localhost --env TWOSPACES --env THREESPACES echo 'a"$TWOSPACES"b' 'a"$THREESPACES"b' ::: 2
+stdout parallel -S localhost --env TWOSPACES,THREESPACES echo 'a"$TWOSPACES"b' 'a"$THREESPACES"b' ::: 2a
+
+echo '### Test --env all chars except \n - single and double - no output is good'
+perl -e 'for(1..9,9,11..255) { printf "%c%c %c%d\0",$_,$_,$_,$_ }' | stdout parallel --nice 19 -j4 -k -I // --arg-sep _ -0 V=// V2=V2=// parallel -k -j1 -S :,1/lo,1/tcsh@lo,1/csh@lo --env V,V2 echo \''"{}$V$V2"'\' ::: {#} {#} {#} {#} | uniq -c | grep -v '   4 '|grep -v xauth |grep -v X11
+
+echo '### Test --env all chars except \n - single and double --onall - no output is good'
+perl -e 'for(1..9,9,11..255) { printf "%c%c %c%d\0",$_,$_,$_,$_ }' | stdout parallel --nice 19 -j4 -k -I // --arg-sep _ -0 V=// V2=V2=// parallel -k -j1 -S :,1/lo,1/tcsh@lo,1/csh@lo --onall --env V,V2 echo \''"{}$V$V2"'\' ::: {#} | uniq -c | grep -v '   4 '|grep -v xauth |grep -v X11
+
--- a/testsuite/wanted-results/parallel-local2
+++ b/testsuite/wanted-results/parallel-local2
@ -46,3 +46,12 @@ a'   * ? >o  <i*? ][\!#¤%=( ) | }b 5
 ### Test --env for \160  - which kills csh - single and double --onall - no output is good
 ### Test too slow spawning
 OK
+### Test --env  - https://savannah.gnu.org/bugs/?37351
+a  2  spaces  b 1
+a  2  spaces  b a >  My brother's 12" records  < b 2
+a  2  spaces  b a >  My brother's 12" records  < b 2a
+a  2  spaces  b 1
+a  2  spaces  b a >  My brother's 12" records  < b 2
+a  2  spaces  b a >  My brother's 12" records  < b 2a
+### Test --env all chars except \n - single and double - no output is good
+### Test --env all chars except \n - single and double --onall - no output is good