Hackout3 · zkamvar · Jul 16, 2016 · Jul 17, 2016 · Jul 17, 2016 · Aug 4, 2016
diff --git a/NAMESPACE b/NAMESPACE
@@ -6,12 +6,14 @@ export(dateDists)
 export(distFuns)
 export(genderDists)
 export(genericDists)
+export(getIndexList)
 export(launch)
 export(locationDists)
 export(matchEpiData)
 export(nameDists)
 export(processFunctionList)
 export(returnMatches)
+export(tablesFromMatch)
 importFrom(lubridate,parse_date_time)
 importFrom(shiny,runApp)
 importFrom(stats,dist)

diff --git a/R/epi_self_match.R b/R/epi_self_match.R
@@ -16,6 +16,7 @@
 #'
 #' @examples
 #' ## Loading Data
+#' 
 #' indata <- system.file("files", package = "epimatch")
 #' indata <- dir(indata, full.names = TRUE)
 #' x <- lapply(indata, read.csv, stringsAsFactors = FALSE)
@@ -25,32 +26,29 @@
 #' case <- x[["CaseInformationForm.csv"]]
 #' lab <- x[["LaboratoryResultsForm7.csv"]]
 #'
+#' funlist <- list(
+#'              list(d1vars = "ID",
+#'                   d2vars = "ID",
+#'                   fun = "nameDists",
+#'                   extraparams = NULL,
+#'                   weight = 1),
+#'              list(d1vars = c("Surname", "OtherNames"),
+#'                   d2vars = c("SurnameLab", "OtherNameLab"),
+#'                   fun = "nameDists",
+#'                   extraparams = NULL,
+#'                   weight = 0.5)
+#'            )
 #' # This will get all of the indices that match the ID and Names with a
 #' # threshold of 0.25
 #' res <- matchEpiData(dat1 = case,
 #'                     dat2 = lab,
-#'                     funlist = list(
-#'                     list(d1vars = "ID",
-#'                          d2vars = "ID",
-#'                          fun = "nameDists",
-#'                          extraparams = NULL,
-#'                          weight = 1),
-#'                     list(d1vars = c("Surname", "OtherNames"),
-#'                          d2vars = c("SurnameLab", "OtherNameLab"),
-#'                          fun = "nameDists",
-#'                          extraparams = NULL,
-#'                          weight = 0.5)
-#'                     ),
-#'                     thresh = 0.25)
+#'                     funlist = funlist,
+#'                     thresh = 0.25,
+#'                     giveWeight = FALSE)
 #' # List of indices
 #' res
 #'
-#' # Printing out the matching names in decreasing order of matching
-#' invisible(lapply(res, function(i) {
-#'    print(case[i$d1, c("Surname", "OtherNames")])
-#'    print(lab[i$d2, c("SurnameLab", "OtherNameLab")])
-#'    cat("\n\t--------\n")
-#'  }))
+#' tablesFromMatch(case, lab, funlist, matchList = res)
 matchEpiData <- function(dat1, dat2 = NULL, funlist = list(), thresh = 0.05, giveWeight = FALSE){
   the_matrices <- processFunctionList(dat1, dat2, funlist)
   the_weights  <- unlist(lapply(funlist, function(i) i$weight))
@@ -59,8 +57,57 @@ matchEpiData <- function(dat1, dat2 = NULL, funlist = list(), thresh = 0.05, giv
   # For processing in the shiny app, it's useful to display either the weights
   # as a named index vector or the indices themselves
   if (!giveWeight){
-    names(out) <- NULL
-    out <- lapply(out, lapply, function(i) as.integer(names(i)))
+    out <- getIndexList(out)
   }
   return(out)
 }
+
+#' return a list of indices from matchEpiData
+#'
+#' The output of matchEpiData can either be a list of indices or a named list of
+#' weights. If it's a named list of weights, this function will return the list
+#' of indices.
+#'
+#' @param matchList a list of lists of weights
+#'
+#' @return an unnamed list of lists of indices
+#' @keywords internal utilities
+#' @export
+#' @examples
+#' ## Loading Data
+#' 
+#' indata <- system.file("files", package = "epimatch")
+#' indata <- dir(indata, full.names = TRUE)
+#' x <- lapply(indata, read.csv, stringsAsFactors = FALSE)
+#' names(x) <- basename(indata)
+#'
+#' # We will use one data set from the case information and lab results
+#' case <- x[["CaseInformationForm.csv"]]
+#' lab <- x[["LaboratoryResultsForm7.csv"]]
+#'
+#' funlist <- list(
+#'              list(d1vars = "ID",
+#'                   d2vars = "ID",
+#'                   fun = "nameDists",
+#'                   extraparams = NULL,
+#'                   weight = 1),
+#'              list(d1vars = c("Surname", "OtherNames"),
+#'                   d2vars = c("SurnameLab", "OtherNameLab"),
+#'                   fun = "nameDists",
+#'                   extraparams = NULL,
+#'                   weight = 0.5)
+#'            )
+#' # This will get all of the indices that match the ID and Names with a
+#' # threshold of 0.25
+#' res <- matchEpiData(dat1 = case,
+#'                     dat2 = lab,
+#'                     funlist = funlist,
+#'                     thresh = 0.25,
+#'                     giveWeight = TRUE)
+#' res               # List of weights
+#' getIndexList(res) # List of indices
+getIndexList <- function(matchList){
+  out <- lapply(matchList, lapply, function(i) as.integer(names(i)))
+  names(out) <- NULL
+  return(out)
+}
diff --git a/R/epimatch-package.r b/R/epimatch-package.r
@@ -25,6 +25,7 @@
 #'   \itemize{
 #'      \item \code{\link{processFunctionList}}
 #'      \item \code{\link{matchEpiData}}
+#'      \item \code{\link{tablesFromMatch}}
 #'   }
 #'
 #' @section Dissimilarity Functions: Each dissimilarity function returns a

diff --git a/R/internal.R b/R/internal.R
@@ -66,3 +66,27 @@ splitComma <- function(dat)
   }
   return(new_dat)
 }
+
+#' Collapse columns of a data frame into one
+#'
+#' @param df a data frame with one or more columns
+#' @param sep a separator to collapse the columns with
+#'
+#' @return a single column data frame
+#' @keywords internal
+#' @noRd
+#'
+#' @examples
+#' dat <- data.frame(a = letters, b = sample(100, 26), c = LETTERS)
+#' collapseValues(dat)
+collapseValues <- function(df, sep = "_"){
+  if (!is.data.frame(df)){
+    stop("a data frame is needed")
+  }
+  if (length(df) > 1){
+    dfnames <- paste(names(df), collapse = sep)
+    df <- data.frame(apply(df, 1, paste, collapse = sep), stringsAsFactors = FALSE)
+    names(df) <- dfnames
+  }
+  return(df)
+}
diff --git a/R/matchAge.R b/R/matchAge.R
@@ -14,6 +14,7 @@
 #' @export
 #'
 #' @examples
+#' 
 #' set.seed(999)
 #' x <- data.frame(age = rpois(5, 30), age_class = "YEAR")
 #' y <- data.frame(age = rpois(5, 18), age_class = "MONTH")

diff --git a/R/matchDate.R b/R/matchDate.R
@@ -14,6 +14,7 @@
 #' @importFrom lubridate parse_date_time
 #'
 #' @examples
+#' 
 #'
 #' # Two data sets in Month-Day-Year and Year-Month-Day format, respectively.
 #' # Note that the way the dates can vary within data sets, but it's assumed

diff --git a/R/matchGender.R b/R/matchGender.R
@@ -11,6 +11,7 @@
 #' @export
 #'
 #' @examples
+#' 
 #' test <- data.frame(c("male", "f", "m"), stringsAsFactors = FALSE)
 #' genderDists(test)
 genderDists <- function(dat1, dat2=NULL,

diff --git a/R/matchGeneric.R b/R/matchGeneric.R
@@ -7,6 +7,7 @@
 #' @export
 #'
 #' @examples
+#' 
 #' set.seed(9)
 #' x <- data.frame(dat = sample(10, replace = TRUE))
 #' x$let <- letters[x$dat]

diff --git a/R/matchNames.R b/R/matchNames.R
@@ -8,6 +8,7 @@
 #' @export
 #'
 #' @examples
+#' 
 #' set.seed(9)
 #' x <- data.frame(x = letters, y = LETTERS, z = 1:26)
 #' x <- x[sample(nrow(x), 10, replace = TRUE), ]

diff --git a/R/process_matching.R b/R/process_matching.R
@@ -20,6 +20,7 @@
 #' @export
 #'
 #' @examples
+#' 
 #' ## Loading Data
 #' indata <- system.file("files", package = "epimatch")
 #' indata <- dir(indata, full.names = TRUE)

diff --git a/R/tableFromMatch.R b/R/tableFromMatch.R
@@ -0,0 +1,137 @@
+#' Generate a list of tables aggregating the matched values
+#'
+#' @inheritParams processFunctionList
+#' @param matchList a list derived from \code{\link{matchEpiData}}. If this is
+#'   \code{NULL}, a threshold can be supplied instead to calculate the list on
+#'   the fly.
+#' @param collapse When \code{TRUE}, the list of data frames will be collapsed
+#'   into one data frame an extra column specifying the group appended.
+#'
+#' @details This will collect all of the data from \code{\link{matchEpiData}}
+#'   and present it in table format. It will collapse them into tidy tables and,
+#'   if provided, the score of the matches will be provided.
+#' @return a list of data frames sorted in decreasing order of matching-ness
+#'   with three extra columns:
+#'   \describe{
+#'       \item{source}{ data set of origin}
+#'       \item{index}{ index in data set}
+#'       \item{score}{ the score of the matches to the first item in the group.}
+#'   }
+#'   When \code{collapse = TRUE}, a fourth column, \code{group} is appended.
+#'
+#' @export
+#'
+#' @examples
+#' ## Loading Data
+#'
+#' indata <- system.file("files", package = "epimatch")
+#' indata <- dir(indata, full.names = TRUE)
+#' x <- lapply(indata, read.csv, stringsAsFactors = FALSE)
+#' names(x) <- basename(indata)
+#'
+#' # We will use one data set from the case information and lab results
+#' case <- x[["CaseInformationForm.csv"]]
+#' lab <- x[["LaboratoryResultsForm7.csv"]]
+#'
+#' funlist <- list(
+#'              list(d1vars = "ID",
+#'                   d2vars = "ID",
+#'                   fun = "nameDists",
+#'                   extraparams = NULL,
+#'                   weight = 1),
+#'              list(d1vars = c("Surname", "OtherNames"),
+#'                   d2vars = c("SurnameLab", "OtherNameLab"),
+#'                   fun = "nameDists",
+#'                   extraparams = NULL,
+#'                   weight = 0.5)
+#'            )
+#' # This will get all of the indices that match the ID and Names with a
+#' # threshold of 0.25
+#' res <- matchEpiData(dat1 = case,
+#'                     dat2 = lab,
+#'                     funlist = funlist,
+#'                     thresh = 0.25,
+#'                     giveWeight = TRUE)
+#' tablesFromMatch(case, lab, funlist, matchList = res, collapse = FALSE)
+#' tablesFromMatch(case, lab, funlist, matchList = 0.25)
+tablesFromMatch <- function(dat1, dat2 = NULL, funlist = list(),
+                            matchList = NULL, collapse = TRUE){
+
+  # Check if incoming data are cromulent
+  dat2exists <- !is.null(dat2) && is.data.frame(dat2)
+  if (!is.data.frame(dat1) | !dat2exists){
+    stop("data must be in data frame format")
+  }
+  # Check if there are any matches at all
+  if (is.null(matchList) || length(matchList) == 0){
+    stop("no matches found!")
+  }
+  # If the matchList is a threshold parameter, pass it to the matchEpiData
+  # function.
+  if (is.numeric(matchList) && length(matchList) == 1){
+    matchList <- matchEpiData(dat1, dat2, funlist = funlist, thresh = matchList,
+                              giveWeight = TRUE)
+  }
+  # If the incoming list contains numeric values, collect the thresholds and
+  # convert the list to indices.
+  if (!is.integer(unlist(matchList))){
+    theThresholds <- matchList
+    matchList     <- getIndexList(matchList)
+  }
+
+  # Grab the tested variables from dat1
+  d1vars <- lapply(funlist, "[[", "d1vars")
+  d1 <- lapply(d1vars, function(i) dat1[i])
+  # Since multiple columns can be evaluated by a single function, we are going
+  # to collapse these columns into one.
+  d1 <- data.frame(lapply(d1, collapseValues))
+
+  if (dat2exists){
+    # Grab the tested variables from dat2 and rename them based off of dat1.
+    # This allows us to bind the data together a lot easier.
+    d2vars <- lapply(funlist, "[[", "d2vars")
+    d2 <- lapply(d2vars, function(i) dat2[i])
+    d2 <- data.frame(lapply(d2, collapseValues))
+    names(d2) <- names(d1)
+  }
+
+  ## The output is a list of data frames subset in decreasing order of matching.
+  outlist <- vector(length = length(matchList), mode = "list")
+  for (i in seq(matchList)){
+    indices <- matchList[[i]]
+    if (dat2exists){
+      outdat <- rbind(d1[indices$d1, ], d2[indices$d2, ])
+    } else {
+      outdat <- d1[indices$d1, ]
+    }
+    ## Give the indices within each data set ------------------------
+    outdat$index <- unlist(indices, use.names = FALSE)
+
+    ## Give the source of the data ----------------------------------
+    outdat$source <- rep(c("d1", "d2"), lapply(indices, length))
+
+    ## If the incoming list are thresholds, add them as a column ----
+    if (exists("theThresholds", inherits = FALSE)){
+      outdat$score <- unlist(theThresholds[[i]])
+    }
+    rownames(outdat) <- NULL
+    outlist[[i]] <- outdat
+  }
+
+  if (collapse){
+    # This is where we make the data tidy by placing everything in a single data
+    # frame and adding three extra columns that contain the information that was
+    # in the result from `matchEpiData()`.
+
+    ## Give the indices of the highest list -------------------------
+    groups <- lapply(lapply(matchList, unlist), length)
+    groups <- rep(seq_along(matchList), groups)
+
+    ## Create the resulting tidy data frame and add information -----
+    outlist <- do.call("rbind", outlist)
+    outlist$group  <- groups
+    rownames(outlist) <- NULL
+  }
+
+  return(outlist)
+}
diff --git a/epimatch.Rproj b/epimatch.Rproj
@@ -17,5 +17,6 @@ StripTrailingWhitespace: Yes
 
 BuildType: Package
 PackageUseDevtools: Yes
-PackageInstallArgs: --no-multiarch --with-keep.source
+PackageInstallArgs: --no-multiarch --with-keep.source --install-tests
+PackageCheckArgs: --as-cran
 PackageRoxygenize: rd,collate,namespace
diff --git a/inst/shiny/server.R b/inst/shiny/server.R
@@ -251,11 +251,13 @@ function(input, output, session) {
       hide("findMatchesLoading")
     })
     tryCatch({
-      values$results <- epimatch::matchEpiData(
+      theMatches <- epimatch::matchEpiData(
         dat1 = values$data1, dat2 = values$data2,
         funlist = funlist,
-        thresh = input$threshold
+        thresh = input$threshold,
+        giveWeight = TRUE
       )
+      values$results <- epimatch::getIndexList(theMatches)
       show(selector = ".findMatchesDone")
       delay(1000, hide(selector = ".findMatchesDone", anim = TRUE, animType = "fade",
                        time = 0.5))

diff --git a/man/ageDists.Rd b/man/ageDists.Rd
diff --git a/man/dateDists.Rd b/man/dateDists.Rd