From a132ed7a2878ac33dee81745500b9e26a80d285a Mon Sep 17 00:00:00 2001
From: Ole Tange <ole@tange.dk>
Date: Mon, 6 Jan 2014 02:57:07 +0100
Subject: [PATCH] load-parallel-results.r: Great updates from David Rosenberg.

---
 src/load-parallel-results.r | 137 ++++++++++++++++++++++++++++++------
 1 file changed, 114 insertions(+), 23 deletions(-)

diff --git a/src/load-parallel-results.r b/src/load-parallel-results.r
index 331e8b47..67aceacb 100644
--- a/src/load-parallel-results.r
+++ b/src/load-parallel-results.r
@@ -1,34 +1,125 @@
+## Copyright (C) 2014 Ole Tange, David Rosenberg, and Free Software
+## Foundation, Inc.
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 3 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful, but
+## WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+## General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, see <http://www.gnu.org/licenses/>
+## or write to the Free Software Foundation, Inc., 51 Franklin St,
+## Fifth Floor, Boston, MA 02110-1301 USA
+##
+##
+## LIBRARY FOR READING GNU PARALLEL RESULTS 
+##
+## Example:
+## parallel --results my/results/dir --header : 'printf "FOO={foo}\\tBAR={bar}\\n";paste <(seq {bar}) <(seq {bar} -1 1)' :::: <(echo foo; seq 100) <(echo bar; seq 10)
+##
+## dir="my/results/dir"
+## filenametable <- load_parallel_results_filenames(dir);
+## raw <- load_parallel_results_raw(filenametable)
+## newlines <- load_parallel_results_split_on_newline(filenametable)
+## rawdt <- raw_to_data.table(raw)
+## rawdf <- raw_to_data.frame(raw)
 
-load_parallel_results <- function(resdir) {
+load_parallel_results_filenames <- function(resdir) {
   ## Find files called .../stdout
   stdoutnames <- list.files(path=resdir, pattern="stdout", recursive=T);
-  ## Read them
-  stdoutcontents <-
-    lapply(stdoutnames, function(x) { return(paste(readLines(paste(resdir,x,sep="/")),collapse="\n")) } );
   ## Find files called .../stderr
   stderrnames <- list.files(path=resdir, pattern="stderr", recursive=T);
-  ## Read them
-  stderrcontents <-
-    lapply(stderrnames, function(x) { return(paste(readLines(paste(resdir,x,sep="/")),collapse="\n")) } );
   if(length(stdoutnames) == 0) {
     ## Return empty data frame if no files found
     return(data.frame());
   }
-
-  ## Make the columns containing the variable values
+  ## Dir levels alternate between argument name and argument value,
+  ## e.g. my/results/dir/age/18/chromosome/20/stdout. drop=FALSE keeps the
+  ## values a matrix (as.table needs one) for a single job or argument
   m <- matrix(unlist(strsplit(stdoutnames, "/")),nrow = length(stdoutnames),byrow=T);
-  mm <- m[,c(F,T)];
-  ## Append the stdout and stderr column
-  mmm <- cbind(mm,unlist(stdoutcontents),unlist(stderrcontents));
-  colnames(mmm) <- c(strsplit(stdoutnames[1],"/")[[1]][c(T,F)],"stderr");
-  ## Example:
-  ## parallel --results my/res/dir --header : 'echo {};seq {myvar2}' ::: myvar1 1 2 ::: myvar2 A B
-   
-  ##  > load_parallel_results("my/res/dir")
-  ##       myvar1 myvar2 stdout      stderr
-  ##  [1,] "1"    "A"    "1 A\n1"    ""
-  ##  [2,] "1"    "B"    "1 B\n1"    ""
-  ##  [3,] "2"    "A"    "2 A\n1\n2" ""
-  ##  [4,] "2"    "B"    "2 B\n1\n2" ""
-  return(mmm);    
+  filenametable <- as.table(m[,c(FALSE,TRUE),drop=FALSE]);
+  ## Append the stdout and stderr filenames
+  filenametable <- cbind(filenametable,
+                         paste(resdir,unlist(stdoutnames),sep="/"),
+                         paste(resdir,unlist(stderrnames),sep="/"));
+  colnames(filenametable) <- c(strsplit(stdoutnames[1],"/")[[1]][c(TRUE,FALSE)],"stderr");
+  return(filenametable);
+}
+
+load_parallel_results_raw <- function(filenametable) {
+  ## Replace the filenames in columns stdout and stderr of
+  ## filenametable with the contents of the files they name.
+  ##
+  ## Helper: read a whole file into a single string
+  slurp <- function(path) {
+    nbytes <- file.info(path)$size;
+    readChar(path, nbytes)
+  }
+  ## Contents of the files listed in column stdout
+  out <- lapply(filenametable[,"stdout"], slurp);
+  ## Contents of the files listed in column stderr
+  err <- lapply(filenametable[,"stderr"], slurp);
+  ## The values are used column by column: all stdout, then all stderr
+  contents <- c(as.character(out), as.character(err));
+  filenametable[,c("stdout","stderr")] <- contents;
+  return(filenametable);
+}
+
+load_parallel_results_split_on_newline <- function(filenametable,split="\n") {
+  ## Returns a matrix with one row per line of stdout: the argument
+  ## columns of that job followed by the line itself
+  raw <- load_parallel_results_raw(filenametable);
+  ## Find the id of the non-stdout and non-stderr columns
+  header_cols <- which(!(colnames(raw) %in% c("stdout","stderr")))
+  ## Split stdout on \n
+  splits <- strsplit(raw[,"stdout"], split)
+  ## Compute the number of lines of each stdout
+  lens <- vapply(splits, length, integer(1))
+  ## The arguments should be repeated as many times as there are lines
+  reps <- rep(seq_len(nrow(raw)), lens)
+  ## Merge the repeated arguments and the lines into a matrix. drop=FALSE
+  ## keeps the matrix shape when there is only one line or one argument
+  return(cbind(raw[reps, header_cols, drop=FALSE], unlist(splits)))
+}
+
+raw_to_data.table <- function(raw, ...) {
+  require(data.table)
+  ## Names of the argument columns: all columns except stdout and stderr
+  varnames = setdiff(colnames(raw), c("stdout","stderr"))  
+  ## Remove rownames
+  rownames(raw) = NULL
+  ## Convert via as.data.frame for now; after a data.table feature request
+  ## that step can be skipped and the conversion will thus be much faster
+  ddt = as.data.table(as.data.frame(raw,stringsAsFactors=FALSE))
+  ## Append \n so fread knows stdout is a string and not a filename
+  ddt[, stdout := paste0(stdout,"\n")]
+  ## Drop empty stdouts; fread the rest per argument group ('...' goes to fread)
+  ddd = ddt[nchar(stdout)>1,fread(stdout, header=FALSE, ...), by=varnames]
+  return(ddd)
+}
+
+raw_to_data.frame <- function(raw, ...) {
+  ## Parse each non-empty stdout with read.table(...) into one data.frame
+  require(plyr)
+  ## Convert to data.frame without factors
+  raw <- as.data.frame(raw, stringsAsFactors = FALSE)
+  ## Keep all columns except stdout and stderr
+  varnames <- setdiff(colnames(raw), c("stdout","stderr"))
+  dd <- ddply(raw, .variables=varnames, function(row) {
+    ## Ignore empty stdouts
+    if (nchar(row[,"stdout"]) == 0) {
+      return(NULL)
+    }
+    ## read.table does not close a connection that is already open, so
+    ## close it on exit; otherwise R runs out of connections on big runs
+    con <- textConnection(row[,"stdout"], open = "r")
+    on.exit(close(con))
+    return(read.table(con, header=FALSE, ...))
+  })
+  return(dd)
 }