diff --git a/NAMESPACE b/NAMESPACE index c03d96a..3a4c84a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,12 +6,14 @@ export(dateDists) export(distFuns) export(genderDists) export(genericDists) +export(getIndexList) export(launch) export(locationDists) export(matchEpiData) export(nameDists) export(processFunctionList) export(returnMatches) +export(tablesFromMatch) importFrom(lubridate,parse_date_time) importFrom(shiny,runApp) importFrom(stats,dist) diff --git a/R/epi_self_match.R b/R/epi_self_match.R index f3b57c4..75707d1 100644 --- a/R/epi_self_match.R +++ b/R/epi_self_match.R @@ -16,6 +16,7 @@ #' #' @examples #' ## Loading Data +#' #' indata <- system.file("files", package = "epimatch") #' indata <- dir(indata, full.names = TRUE) #' x <- lapply(indata, read.csv, stringsAsFactors = FALSE) @@ -25,32 +26,29 @@ #' case <- x[["CaseInformationForm.csv"]] #' lab <- x[["LaboratoryResultsForm7.csv"]] #' +#' funlist <- list( +#' list(d1vars = "ID", +#' d2vars = "ID", +#' fun = "nameDists", +#' extraparams = NULL, +#' weight = 1), +#' list(d1vars = c("Surname", "OtherNames"), +#' d2vars = c("SurnameLab", "OtherNameLab"), +#' fun = "nameDists", +#' extraparams = NULL, +#' weight = 0.5) +#' ) #' # This will get all of the indices that match the ID and Names with a #' # threshold of 0.25 #' res <- matchEpiData(dat1 = case, #' dat2 = lab, -#' funlist = list( -#' list(d1vars = "ID", -#' d2vars = "ID", -#' fun = "nameDists", -#' extraparams = NULL, -#' weight = 1), -#' list(d1vars = c("Surname", "OtherNames"), -#' d2vars = c("SurnameLab", "OtherNameLab"), -#' fun = "nameDists", -#' extraparams = NULL, -#' weight = 0.5) -#' ), -#' thresh = 0.25) +#' funlist = funlist, +#' thresh = 0.25, +#' giveWeight = FALSE) #' # List of indices #' res #' -#' # Printing out the matching names in decreasing order of matching -#' invisible(lapply(res, function(i) { -#' print(case[i$d1, c("Surname", "OtherNames")]) -#' print(lab[i$d2, c("SurnameLab", "OtherNameLab")]) -#' cat("\n\t--------\n") -#' })) +#' tablesFromMatch(case, lab, funlist, matchList = res) matchEpiData <- function(dat1, dat2 = NULL, funlist = list(), thresh = 0.05, giveWeight = FALSE){ the_matrices <- processFunctionList(dat1, dat2, funlist) the_weights <- unlist(lapply(funlist, function(i) i$weight)) @@ -59,8 +57,57 @@ matchEpiData <- function(dat1, dat2 = NULL, funlist = list(), thresh = 0.05, giv # For processing in the shiny app, it's useful to display either the weights # as a named index vector or the indices themselves if (!giveWeight){ - names(out) <- NULL - out <- lapply(out, lapply, function(i) as.integer(names(i))) + out <- getIndexList(out) } return(out) } + +#' return a list of indices from matchEpiData +#' +#' The output of matchEpiData can either be a list of indices or a named list of +#' weights. If it's a named list of weights, this function will return the list +#' of indices. +#' +#' @param matchList a list of lists of weights +#' +#' @return an unnamed list of lists of indices +#' @keywords internal utilities +#' @export +#' @examples +#' ## Loading Data +#' +#' indata <- system.file("files", package = "epimatch") +#' indata <- dir(indata, full.names = TRUE) +#' x <- lapply(indata, read.csv, stringsAsFactors = FALSE) +#' names(x) <- basename(indata) +#' +#' # We will use one data set from the case information and lab results +#' case <- x[["CaseInformationForm.csv"]] +#' lab <- x[["LaboratoryResultsForm7.csv"]] +#' +#' funlist <- list( +#' list(d1vars = "ID", +#' d2vars = "ID", +#' fun = "nameDists", +#' extraparams = NULL, +#' weight = 1), +#' list(d1vars = c("Surname", "OtherNames"), +#' d2vars = c("SurnameLab", "OtherNameLab"), +#' fun = "nameDists", +#' extraparams = NULL, +#' weight = 0.5) +#' ) +#' # This will get all of the indices that match the ID and Names with a +#' # threshold of 0.25 +#' res <- matchEpiData(dat1 = case, +#' dat2 = lab, +#' funlist = funlist, +#' thresh = 0.25, +#' giveWeight = TRUE) +#' res # List of weights +#' getIndexList(res) # List of indices +getIndexList <- function(matchList){ + out <- lapply(matchList, lapply, function(i) as.integer(names(i))) + names(out) <- NULL + return(out) +} diff --git a/R/epimatch-package.r b/R/epimatch-package.r index 2bc77cb..ec75319 100644 --- a/R/epimatch-package.r +++ b/R/epimatch-package.r @@ -25,6 +25,7 @@ #' \itemize{ #' \item \code{\link{processFunctionList}} #' \item \code{\link{matchEpiData}} +#' \item \code{\link{tablesFromMatch}} #' } #' #' @section Dissimilarity Functions: Each dissimilarity function returns a diff --git a/R/internal.R b/R/internal.R index 89fbcbc..3e7b62b 100644 --- a/R/internal.R +++ b/R/internal.R @@ -66,3 +66,27 @@ splitComma <- function(dat) } return(new_dat) } + +#' Collapse columns of a data frame into one +#' +#' @param df a data frame with one or more columns +#' @param sep a separator to collapse the columns with +#' +#' @return a single column data frame +#' @keywords internal +#' @noRd +#' +#' @examples +#' dat <- data.frame(a = letters, b = sample(100, 26), c = LETTERS) +#' collapseValues(dat) +collapseValues <- function(df, sep = "_"){ + if (!is.data.frame(df)){ + stop("a data frame is needed") + } + if (length(df) > 1){ + dfnames <- paste(names(df), collapse = sep) + df <- data.frame(apply(df, 1, paste, collapse = sep), stringsAsFactors = FALSE) + names(df) <- dfnames + } + return(df) +} diff --git a/R/matchAge.R b/R/matchAge.R index c037893..b559cf2 100644 --- a/R/matchAge.R +++ b/R/matchAge.R @@ -14,6 +14,7 @@ #' @export #' #' @examples +#' #' set.seed(999) #' x <- data.frame(age = rpois(5, 30), age_class = "YEAR") #' y <- data.frame(age = rpois(5, 18), age_class = "MONTH") diff --git a/R/matchDate.R b/R/matchDate.R index a3527c6..1d15c41 100644 --- a/R/matchDate.R +++ b/R/matchDate.R @@ -14,6 +14,7 @@ #' @importFrom lubridate parse_date_time #' #' @examples +#' #' #' # Two data sets in Month-Day-Year and Year-Month-Day format, respectively. #' # Note that the way the dates can vary within data sets, but it's assumed diff --git a/R/matchGender.R b/R/matchGender.R index 2414b5d..29a1a8d 100644 --- a/R/matchGender.R +++ b/R/matchGender.R @@ -11,6 +11,7 @@ #' @export #' #' @examples +#' #' test <- data.frame(c("male", "f", "m"), stringsAsFactors = FALSE) #' genderDists(test) genderDists <- function(dat1, dat2=NULL, diff --git a/R/matchGeneric.R b/R/matchGeneric.R index 5cff81b..4d70b10 100644 --- a/R/matchGeneric.R +++ b/R/matchGeneric.R @@ -7,6 +7,7 @@ #' @export #' #' @examples +#' #' set.seed(9) #' x <- data.frame(dat = sample(10, replace = TRUE)) #' x$let <- letters[x$dat] diff --git a/R/matchNames.R b/R/matchNames.R index ed1a64d..a57c339 100644 --- a/R/matchNames.R +++ b/R/matchNames.R @@ -8,6 +8,7 @@ #' @export #' #' @examples +#' #' set.seed(9) #' x <- data.frame(x = letters, y = LETTERS, z = 1:26) #' x <- x[sample(nrow(x), 10, replace = TRUE), ] diff --git a/R/process_matching.R b/R/process_matching.R index 5b3c34c..520acea 100644 --- a/R/process_matching.R +++ b/R/process_matching.R @@ -20,6 +20,7 @@ #' @export #' #' @examples +#' #' ## Loading Data #' indata <- system.file("files", package = "epimatch") #' indata <- dir(indata, full.names = TRUE) diff --git a/R/tableFromMatch.R b/R/tableFromMatch.R new file mode 100644 index 0000000..243e7ba --- /dev/null +++ b/R/tableFromMatch.R @@ -0,0 +1,137 @@ +#' Generate a list of tables aggregating the matched values +#' +#' @inheritParams processFunctionList +#' @param matchList a list derived from \code{\link{matchEpiData}}. If this is +#' \code{NULL}, a threshold can be supplied instead to calculate the list on +#' the fly. +#' @param collapse When \code{TRUE}, the list of data frames will be collapsed +#' into one data frame an extra column specifying the group appended. +#' +#' @details This will collect all of the data from \code{\link{matchEpiData}} +#' and present it in table format. It will collapse them into tidy tables and, +#' if provided, the score of the matches will be provided. +#' @return a list of data frames sorted in decreasing order of matching-ness +#' with three extra columns: +#' \describe{ +#' \item{source}{ data set of origin} +#' \item{index}{ index in data set} +#' \item{score}{ the score of the matches to the first item in the group.} +#' } +#' When \code{collapse = TRUE}, a fourth column, \code{group} is appended. +#' +#' @export +#' +#' @examples +#' ## Loading Data +#' +#' indata <- system.file("files", package = "epimatch") +#' indata <- dir(indata, full.names = TRUE) +#' x <- lapply(indata, read.csv, stringsAsFactors = FALSE) +#' names(x) <- basename(indata) +#' +#' # We will use one data set from the case information and lab results +#' case <- x[["CaseInformationForm.csv"]] +#' lab <- x[["LaboratoryResultsForm7.csv"]] +#' +#' funlist <- list( +#' list(d1vars = "ID", +#' d2vars = "ID", +#' fun = "nameDists", +#' extraparams = NULL, +#' weight = 1), +#' list(d1vars = c("Surname", "OtherNames"), +#' d2vars = c("SurnameLab", "OtherNameLab"), +#' fun = "nameDists", +#' extraparams = NULL, +#' weight = 0.5) +#' ) +#' # This will get all of the indices that match the ID and Names with a +#' # threshold of 0.25 +#' res <- matchEpiData(dat1 = case, +#' dat2 = lab, +#' funlist = funlist, +#' thresh = 0.25, +#' giveWeight = TRUE) +#' tablesFromMatch(case, lab, funlist, matchList = res, collapse = FALSE) +#' tablesFromMatch(case, lab, funlist, matchList = 0.25) +tablesFromMatch <- function(dat1, dat2 = NULL, funlist = list(), + matchList = NULL, collapse = TRUE){ + + # Check if incoming data are cromulent + dat2exists <- !is.null(dat2) && is.data.frame(dat2) + if (!is.data.frame(dat1) | !dat2exists){ + stop("data must be in data frame format") + } + # Check if there are any matches at all + if (is.null(matchList) || length(matchList) == 0){ + stop("no matches found!") + } + # If the matchList is a threshold parameter, pass it to the matchEpiData + # function. + if (is.numeric(matchList) && length(matchList) == 1){ + matchList <- matchEpiData(dat1, dat2, funlist = funlist, thresh = matchList, + giveWeight = TRUE) + } + # If the incoming list contains numeric values, collect the thresholds and + # convert the list to indices. + if (!is.integer(unlist(matchList))){ + theThresholds <- matchList + matchList <- getIndexList(matchList) + } + + # Grab the tested variables from dat1 + d1vars <- lapply(funlist, "[[", "d1vars") + d1 <- lapply(d1vars, function(i) dat1[i]) + # Since multiple columns can be evaluated by a single function, we are going + # to collapse these columns into one. + d1 <- data.frame(lapply(d1, collapseValues)) + + if (dat2exists){ + # Grab the tested variables from dat2 and rename them based off of dat1. + # This allows us to bind the data together a lot easier. + d2vars <- lapply(funlist, "[[", "d2vars") + d2 <- lapply(d2vars, function(i) dat2[i]) + d2 <- data.frame(lapply(d2, collapseValues)) + names(d2) <- names(d1) + } + + ## The output is a list of data frames subset in decreasing order of matching. + outlist <- vector(length = length(matchList), mode = "list") + for (i in seq(matchList)){ + indices <- matchList[[i]] + if (dat2exists){ + outdat <- rbind(d1[indices$d1, ], d2[indices$d2, ]) + } else { + outdat <- d1[indices$d1, ] + } + ## Give the indices within each data set ------------------------ + outdat$index <- unlist(indices, use.names = FALSE) + + ## Give the source of the data ---------------------------------- + outdat$source <- rep(c("d1", "d2"), lapply(indices, length)) + + ## If the incoming list are thresholds, add them as a column ---- + if (exists("theThresholds", inherits = FALSE)){ + outdat$score <- unlist(theThresholds[[i]]) + } + rownames(outdat) <- NULL + outlist[[i]] <- outdat + } + + if (collapse){ + # This is where we make the data tidy by placing everything in a single data + # frame and adding three extra columns that contain the information that was + # in the result from `matchEpiData()`. + + ## Give the indices of the highest list ------------------------- + groups <- lapply(lapply(matchList, unlist), length) + groups <- rep(seq_along(matchList), groups) + + ## Create the resulting tidy data frame and add information ----- + outlist <- do.call("rbind", outlist) + outlist$group <- groups + rownames(outlist) <- NULL + } + + return(outlist) +} diff --git a/epimatch.Rproj b/epimatch.Rproj index cba1b6b..7c89849 100644 --- a/epimatch.Rproj +++ b/epimatch.Rproj @@ -17,5 +17,6 @@ StripTrailingWhitespace: Yes BuildType: Package PackageUseDevtools: Yes -PackageInstallArgs: --no-multiarch --with-keep.source +PackageInstallArgs: --no-multiarch --with-keep.source --install-tests +PackageCheckArgs: --as-cran PackageRoxygenize: rd,collate,namespace diff --git a/inst/shiny/server.R b/inst/shiny/server.R index d5fedd6..16a06f8 100644 --- a/inst/shiny/server.R +++ b/inst/shiny/server.R @@ -251,11 +251,13 @@ function(input, output, session) { hide("findMatchesLoading") }) tryCatch({ - values$results <- epimatch::matchEpiData( + theMatches <- epimatch::matchEpiData( dat1 = values$data1, dat2 = values$data2, funlist = funlist, - thresh = input$threshold + thresh = input$threshold, + giveWeight = TRUE ) + values$results <- epimatch::getIndexList(theMatches) show(selector = ".findMatchesDone") delay(1000, hide(selector = ".findMatchesDone", anim = TRUE, animType = "fade", time = 0.5)) diff --git a/man/ageDists.Rd b/man/ageDists.Rd index 039da4a..a4d4e05 100644 --- a/man/ageDists.Rd +++ b/man/ageDists.Rd @@ -27,6 +27,7 @@ This function will take one or two data frames containing ages data frame with a specifier for months or year in the second column. } \examples{ + set.seed(999) x <- data.frame(age = rpois(5, 30), age_class = "YEAR") y <- data.frame(age = rpois(5, 18), age_class = "MONTH") diff --git a/man/dateDists.Rd b/man/dateDists.Rd index f080758..2056cbc 100644 --- a/man/dateDists.Rd +++ b/man/dateDists.Rd @@ -28,6 +28,7 @@ The formats for dates are passed to \pkg{lubridate}'s function } \examples{ + # Two data sets in Month-Day-Year and Year-Month-Day format, respectively. # Note that the way the dates can vary within data sets, but it's assumed # that the order within a data set is consistant. diff --git a/man/epimatch.Rd b/man/epimatch.Rd index 612bbe5..96d3767 100644 --- a/man/epimatch.Rd +++ b/man/epimatch.Rd @@ -37,6 +37,7 @@ Find matching patient records across tabular datasets \itemize{ \item \code{\link{processFunctionList}} \item \code{\link{matchEpiData}} + \item \code{\link{tablesFromMatch}} } } diff --git a/man/genderDists.Rd b/man/genderDists.Rd index 0405e95..dfe1204 100644 --- a/man/genderDists.Rd +++ b/man/genderDists.Rd @@ -27,6 +27,7 @@ A thin wrapper around generic, which converts some known male and female codes to binary } \examples{ + test <- data.frame(c("male", "f", "m"), stringsAsFactors = FALSE) genderDists(test) } diff --git a/man/genericDists.Rd b/man/genericDists.Rd index aa34061..46a4d27 100644 --- a/man/genericDists.Rd +++ b/man/genericDists.Rd @@ -18,6 +18,7 @@ a pairwise matrix of scores from 0 (exact match) to 1 (no match) generic function for exact matching single columns } \examples{ + set.seed(9) x <- data.frame(dat = sample(10, replace = TRUE)) x$let <- letters[x$dat] diff --git a/man/getIndexList.Rd b/man/getIndexList.Rd new file mode 100644 index 0000000..a7ea751 --- /dev/null +++ b/man/getIndexList.Rd @@ -0,0 +1,56 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/epi_self_match.R +\name{getIndexList} +\alias{getIndexList} +\title{return a list of indices from matchEpiData} +\usage{ +getIndexList(matchList) +} +\arguments{ +\item{matchList}{a list of lists of weights} +} +\value{ +an unnamed list of lists of indices +} +\description{ +The output of matchEpiData can either be a list of indices or a named list of +weights. If it's a named list of weights, this function will return the list +of indices. +} +\examples{ +## Loading Data + +indata <- system.file("files", package = "epimatch") +indata <- dir(indata, full.names = TRUE) +x <- lapply(indata, read.csv, stringsAsFactors = FALSE) +names(x) <- basename(indata) + +# We will use one data set from the case information and lab results +case <- x[["CaseInformationForm.csv"]] +lab <- x[["LaboratoryResultsForm7.csv"]] + +funlist <- list( + list(d1vars = "ID", + d2vars = "ID", + fun = "nameDists", + extraparams = NULL, + weight = 1), + list(d1vars = c("Surname", "OtherNames"), + d2vars = c("SurnameLab", "OtherNameLab"), + fun = "nameDists", + extraparams = NULL, + weight = 0.5) + ) +# This will get all of the indices that match the ID and Names with a +# threshold of 0.25 +res <- matchEpiData(dat1 = case, + dat2 = lab, + funlist = funlist, + thresh = 0.25, + giveWeight = TRUE) +res # List of weights +getIndexList(res) # List of indices +} +\keyword{internal} +\keyword{utilities} + diff --git a/man/matchEpiData.Rd b/man/matchEpiData.Rd index ef94468..49dabfc 100644 --- a/man/matchEpiData.Rd +++ b/man/matchEpiData.Rd @@ -42,6 +42,7 @@ this function will take in one or two data sets, a list of functions } \examples{ ## Loading Data + indata <- system.file("files", package = "epimatch") indata <- dir(indata, full.names = TRUE) x <- lapply(indata, read.csv, stringsAsFactors = FALSE) @@ -51,31 +52,28 @@ names(x) <- basename(indata) case <- x[["CaseInformationForm.csv"]] lab <- x[["LaboratoryResultsForm7.csv"]] +funlist <- list( + list(d1vars = "ID", + d2vars = "ID", + fun = "nameDists", + extraparams = NULL, + weight = 1), + list(d1vars = c("Surname", "OtherNames"), + d2vars = c("SurnameLab", "OtherNameLab"), + fun = "nameDists", + extraparams = NULL, + weight = 0.5) + ) # This will get all of the indices that match the ID and Names with a # threshold of 0.25 res <- matchEpiData(dat1 = case, dat2 = lab, - funlist = list( - list(d1vars = "ID", - d2vars = "ID", - fun = "nameDists", - extraparams = NULL, - weight = 1), - list(d1vars = c("Surname", "OtherNames"), - d2vars = c("SurnameLab", "OtherNameLab"), - fun = "nameDists", - extraparams = NULL, - weight = 0.5) - ), - thresh = 0.25) + funlist = funlist, + thresh = 0.25, + giveWeight = FALSE) # List of indices res -# Printing out the matching names in decreasing order of matching -invisible(lapply(res, function(i) { - print(case[i$d1, c("Surname", "OtherNames")]) - print(lab[i$d2, c("SurnameLab", "OtherNameLab")]) - cat("\\n\\t--------\\n") - })) +tablesFromMatch(case, lab, funlist, matchList = res) } diff --git a/man/nameDists.Rd b/man/nameDists.Rd index 703b01d..08aa219 100644 --- a/man/nameDists.Rd +++ b/man/nameDists.Rd @@ -20,6 +20,7 @@ Clean and match names column by removing punctuation and replacing punctuation with an underscore } \examples{ + set.seed(9) x <- data.frame(x = letters, y = LETTERS, z = 1:26) x <- x[sample(nrow(x), 10, replace = TRUE), ] diff --git a/man/processFunctionList.Rd b/man/processFunctionList.Rd index 66011bb..3c8b097 100644 --- a/man/processFunctionList.Rd +++ b/man/processFunctionList.Rd @@ -29,6 +29,7 @@ column names, and function parameters, and return a list of matrices by function. } \examples{ + ## Loading Data indata <- system.file("files", package = "epimatch") indata <- dir(indata, full.names = TRUE) diff --git a/man/tablesFromMatch.Rd b/man/tablesFromMatch.Rd new file mode 100644 index 0000000..eb5b582 --- /dev/null +++ b/man/tablesFromMatch.Rd @@ -0,0 +1,83 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tableFromMatch.R +\name{tablesFromMatch} +\alias{tablesFromMatch} +\title{Generate a list of tables aggregating the matched values} +\usage{ +tablesFromMatch(dat1, dat2 = NULL, funlist = list(), matchList = NULL, + collapse = TRUE) +} +\arguments{ +\item{dat1}{An input linelist} + +\item{dat2}{An optional extra linelist} + +\item{funlist}{A list containing lists containing: +\itemize{ + \item d1vars - variable names for dataset 1 + \item d2vars - variable names for dataset 2 + \item fun - function name to process on these variables + \item extraparams - extra parameters that need to be applied with the function. + \item weights - a weight vector to scale each matrix (not used in processFunctionList). +}} + +\item{matchList}{a list derived from \code{\link{matchEpiData}}. If this is +\code{NULL}, a threshold can be supplied instead to calculate the list on +the fly.} + +\item{collapse}{When \code{TRUE}, the list of data frames will be collapsed +into one data frame an extra column specifying the group appended.} +} +\value{ +a list of data frames sorted in decreasing order of matching-ness + with three extra columns: + \describe{ + \item{source}{ data set of origin} + \item{index}{ index in data set} + \item{score}{ the score of the matches to the first item in the group.} + } + When \code{collapse = TRUE}, a fourth column, \code{group} is appended. +} +\description{ +Generate a list of tables aggregating the matched values +} +\details{ +This will collect all of the data from \code{\link{matchEpiData}} + and present it in table format. It will collapse them into tidy tables and, + if provided, the score of the matches will be provided. +} +\examples{ +## Loading Data + +indata <- system.file("files", package = "epimatch") +indata <- dir(indata, full.names = TRUE) +x <- lapply(indata, read.csv, stringsAsFactors = FALSE) +names(x) <- basename(indata) + +# We will use one data set from the case information and lab results +case <- x[["CaseInformationForm.csv"]] +lab <- x[["LaboratoryResultsForm7.csv"]] + +funlist <- list( + list(d1vars = "ID", + d2vars = "ID", + fun = "nameDists", + extraparams = NULL, + weight = 1), + list(d1vars = c("Surname", "OtherNames"), + d2vars = c("SurnameLab", "OtherNameLab"), + fun = "nameDists", + extraparams = NULL, + weight = 0.5) + ) +# This will get all of the indices that match the ID and Names with a +# threshold of 0.25 +res <- matchEpiData(dat1 = case, + dat2 = lab, + funlist = funlist, + thresh = 0.25, + giveWeight = TRUE) +tablesFromMatch(case, lab, funlist, matchList = res, collapse = FALSE) +tablesFromMatch(case, lab, funlist, matchList = 0.25) +} + diff --git a/tests/testthat/test-export.R b/tests/testthat/test-export.R new file mode 100644 index 0000000..37b1b36 --- /dev/null +++ b/tests/testthat/test-export.R @@ -0,0 +1,55 @@ +context("tablesFromMatch tests") + +indata <- system.file("files", package = "epimatch") +indata <- dir(indata, full.names = TRUE) +x <- lapply(indata, read.csv, stringsAsFactors = FALSE) +names(x) <- basename(indata) + +# We will use one data set from the case information and lab results +case <- x[["CaseInformationForm.csv"]] +lab <- x[["LaboratoryResultsForm7.csv"]] + +funlist <- list( + list(d1vars = "ID", + d2vars = "ID", + fun = "nameDists", + extraparams = NULL, + weight = 1), + list(d1vars = c("Surname", "OtherNames"), + d2vars = c("SurnameLab", "OtherNameLab"), + fun = "nameDists", + extraparams = NULL, + weight = 0.5) + ) +res <- matchEpiData(dat1 = case, + dat2 = lab, + funlist = funlist, + thresh = 0.25, + giveWeight = TRUE) +l_res <- tablesFromMatch(case, lab, funlist, matchList = res, collapse = FALSE) +t_res <- tablesFromMatch(case, lab, funlist, matchList = 0.25) + +## This not only tests the conversion, but since the input for the tests above +## are the same, this also tests to ensure that the matchList argument can take +## the result of matchEpiData AND a threshold. +test_that("list output and collapsed output are the same", { + expect_equal(sum(vapply(l_res, nrow, 1L)), nrow(t_res)) + expect_equal(do.call("rbind", l_res), t_res[names(l_res[[1]])]) +}) + +test_that("tablesFromMatch returns a list when collapse = FALSE", { + expect_is(l_res, "list") + for (i in seq_along(l_res)){ + dat <- l_res[[i]] + expect_is(dat, "data.frame", info = paste("group", i)) + expect_equal(ncol(dat), ncol(l_res[[i]])) + } +}) + +test_that("the data frame is tidy", { + expect_is(t_res, "data.frame") + expect_identical(names(t_res)[length(t_res)], "group") + expect_equal(length(unique(t_res$group)), length(l_res)) + expect_equal(nrow(t_res), 20L) + expect_equal(ncol(t_res), 6L) +})