From b6e49744b9c27034dc54bb892be5a74742c08989 Mon Sep 17 00:00:00 2001 From: Zhian Kamvar Date: Sat, 16 Jul 2016 16:38:53 -0700 Subject: [PATCH 01/16] Create internal function for weight/index switch --- R/epi_self_match.R | 28 +++++++++++------------ R/internal.R | 49 ++++++++++++++++++++++++++++++++++++++++ man/getIndexList.Rd | 54 +++++++++++++++++++++++++++++++++++++++++++++ man/matchEpiData.Rd | 25 +++++++++++---------- 4 files changed, 130 insertions(+), 26 deletions(-) create mode 100644 man/getIndexList.Rd diff --git a/R/epi_self_match.R b/R/epi_self_match.R index f3b57c4..6fb57be 100644 --- a/R/epi_self_match.R +++ b/R/epi_self_match.R @@ -25,22 +25,23 @@ #' case <- x[["CaseInformationForm.csv"]] #' lab <- x[["LaboratoryResultsForm7.csv"]] #' +#' funlist <- list( +#' list(d1vars = "ID", +#' d2vars = "ID", +#' fun = "nameDists", +#' extraparams = NULL, +#' weight = 1), +#' list(d1vars = c("Surname", "OtherNames"), +#' d2vars = c("SurnameLab", "OtherNameLab"), +#' fun = "nameDists", +#' extraparams = NULL, +#' weight = 0.5) +#' ) #' # This will get all of the indices that match the ID and Names with a #' # threshold of 0.25 #' res <- matchEpiData(dat1 = case, #' dat2 = lab, -#' funlist = list( -#' list(d1vars = "ID", -#' d2vars = "ID", -#' fun = "nameDists", -#' extraparams = NULL, -#' weight = 1), -#' list(d1vars = c("Surname", "OtherNames"), -#' d2vars = c("SurnameLab", "OtherNameLab"), -#' fun = "nameDists", -#' extraparams = NULL, -#' weight = 0.5) -#' ), +#' funlist = funlist, #' thresh = 0.25) #' # List of indices #' res @@ -59,8 +60,7 @@ matchEpiData <- function(dat1, dat2 = NULL, funlist = list(), thresh = 0.05, giv # For processing in the shiny app, it's useful to display either the weights # as a named index vector or the indices themselves if (!giveWeight){ - names(out) <- NULL - out <- lapply(out, lapply, function(i) as.integer(names(i))) + out <- getIndexList(out) } return(out) } diff --git a/R/internal.R b/R/internal.R index 89fbcbc..f62af2f 100644 --- a/R/internal.R +++ b/R/internal.R @@ -66,3 +66,52 @@ splitComma <- function(dat) } return(new_dat) } + + +#' return a list of indices from matchEpiData +#' +#' The output of matchEpiData can either be a list of indices or a named list of +#' weights. If it's a named list of weights, this function will return the list +#' of indices. +#' +#' @param matchList a list of lists of weights. +#' +#' @return an unnamed list of lists of indices +#' @keywords internal +#' @examples +#' ## Loading Data +#' indata <- system.file("files", package = "epimatch") +#' indata <- dir(indata, full.names = TRUE) +#' x <- lapply(indata, read.csv, stringsAsFactors = FALSE) +#' names(x) <- basename(indata) +#' +#' # We will use one data set from the case information and lab results +#' case <- x[["CaseInformationForm.csv"]] +#' lab <- x[["LaboratoryResultsForm7.csv"]] +#' +#' funlist <- list( +#' list(d1vars = "ID", +#' d2vars = "ID", +#' fun = "nameDists", +#' extraparams = NULL, +#' weight = 1), +#' list(d1vars = c("Surname", "OtherNames"), +#' d2vars = c("SurnameLab", "OtherNameLab"), +#' fun = "nameDists", +#' extraparams = NULL, +#' weight = 0.5) +#' ) +#' # This will get all of the indices that match the ID and Names with a +#' # threshold of 0.25 +#' res <- matchEpiData(dat1 = case, +#' dat2 = lab, +#' funlist = funlist, +#' thresh = 0.25, +#' giveWeight = TRUE) +#' res # List of weights +#' getIndexList(res) # List of indices +getIndexList <- function(matchList){ + out <- lapply(matchList, lapply, function(i) as.integer(names(i))) + names(out) <- NULL + return(out) +} diff --git a/man/getIndexList.Rd b/man/getIndexList.Rd new file mode 100644 index 0000000..a473a45 --- /dev/null +++ b/man/getIndexList.Rd @@ -0,0 +1,54 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/internal.R +\name{getIndexList} +\alias{getIndexList} +\title{return a list of indices from matchEpiData} +\usage{ +getIndexList(matchList) +} +\arguments{ +\item{matchList}{a list of lists of weights.} +} +\value{ +an unnamed list of lists of indices +} +\description{ +The output of matchEpiData can either be a list of indices or a named list of +weights. If it's a named list of weights, this function will return the list +of indices. +} +\examples{ +## Loading Data +indata <- system.file("files", package = "epimatch") +indata <- dir(indata, full.names = TRUE) +x <- lapply(indata, read.csv, stringsAsFactors = FALSE) +names(x) <- basename(indata) + +# We will use one data set from the case information and lab results +case <- x[["CaseInformationForm.csv"]] +lab <- x[["LaboratoryResultsForm7.csv"]] + +funlist <- list( + list(d1vars = "ID", + d2vars = "ID", + fun = "nameDists", + extraparams = NULL, + weight = 1), + list(d1vars = c("Surname", "OtherNames"), + d2vars = c("SurnameLab", "OtherNameLab"), + fun = "nameDists", + extraparams = NULL, + weight = 0.5) + ) +# This will get all of the indices that match the ID and Names with a +# threshold of 0.25 +res <- matchEpiData(dat1 = case, + dat2 = lab, + funlist = funlist, + thresh = 0.25, + giveWeight = TRUE) +res # List of weights +getIndexList(res) # List of indices +} +\keyword{internal} + diff --git a/man/matchEpiData.Rd b/man/matchEpiData.Rd index ef94468..92bde0d 100644 --- a/man/matchEpiData.Rd +++ b/man/matchEpiData.Rd @@ -51,22 +51,23 @@ names(x) <- basename(indata) case <- x[["CaseInformationForm.csv"]] lab <- x[["LaboratoryResultsForm7.csv"]] +funlist <- list( + list(d1vars = "ID", + d2vars = "ID", + fun = "nameDists", + extraparams = NULL, + weight = 1), + list(d1vars = c("Surname", "OtherNames"), + d2vars = c("SurnameLab", "OtherNameLab"), + fun = "nameDists", + extraparams = NULL, + weight = 0.5) + ) # This will get all of the indices that match the ID and Names with a # threshold of 0.25 res <- matchEpiData(dat1 = case, dat2 = lab, - funlist = list( - list(d1vars = "ID", - d2vars = "ID", - fun = "nameDists", - extraparams = NULL, - weight = 1), - list(d1vars = c("Surname", "OtherNames"), - d2vars = c("SurnameLab", "OtherNameLab"), - fun = "nameDists", - extraparams = NULL, - weight = 0.5) - ), + funlist = funlist, thresh = 0.25) # List of indices res From 43f4aeed4a0bb7ab5a878a626a9e3ee8d1358a9a Mon Sep 17 00:00:00 2001 From: Zhian Kamvar Date: Sat, 16 Jul 2016 17:47:02 -0700 Subject: [PATCH 02/16] Add collapseValues internal function --- R/internal.R | 25 +++++++++++++++++++++ man/getIndexList.Rd | 54 --------------------------------------------- 2 files changed, 25 insertions(+), 54 deletions(-) delete mode 100644 man/getIndexList.Rd diff --git a/R/internal.R b/R/internal.R index f62af2f..0a5a15e 100644 --- a/R/internal.R +++ b/R/internal.R @@ -78,6 +78,7 @@ splitComma <- function(dat) #' #' @return an unnamed list of lists of indices #' @keywords internal +#' @noRd #' @examples #' ## Loading Data #' indata <- system.file("files", package = "epimatch") @@ -115,3 +116,27 @@ getIndexList <- function(matchList){ names(out) <- NULL return(out) } + +#' Collapse columns of a data frame into one +#' +#' @param df a data frame with one or more columns +#' @param sep a separator to collapse the columns with +#' +#' @return a single column data frame +#' @keywords internal +#' @noRd +#' +#' @examples +#' dat <- data.frame(a = letters, b = sample(100, 26), c = LETTERS) +#' collapseValues(dat) +collapseValues <- function(df, sep = "_"){ + if (!is.data.frame(df)){ + stop("a data frame is needed") + } + if (length(df) > 1){ + dfnames <- paste(names(df), collapse = sep) + df <- data.frame(apply(df, 1, paste, collapse = sep), stringsAsFactors = FALSE) + names(df) <- dfnames + } + return(df) +} diff --git a/man/getIndexList.Rd b/man/getIndexList.Rd deleted file mode 100644 index a473a45..0000000 --- a/man/getIndexList.Rd +++ /dev/null @@ -1,54 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/internal.R -\name{getIndexList} -\alias{getIndexList} -\title{return a list of indices from matchEpiData} -\usage{ -getIndexList(matchList) -} -\arguments{ -\item{matchList}{a list of lists of weights.} -} -\value{ -an unnamed list of lists of indices -} -\description{ -The output of matchEpiData can either be a list of indices or a named list of -weights. If it's a named list of weights, this function will return the list -of indices. -} -\examples{ -## Loading Data -indata <- system.file("files", package = "epimatch") -indata <- dir(indata, full.names = TRUE) -x <- lapply(indata, read.csv, stringsAsFactors = FALSE) -names(x) <- basename(indata) - -# We will use one data set from the case information and lab results -case <- x[["CaseInformationForm.csv"]] -lab <- x[["LaboratoryResultsForm7.csv"]] - -funlist <- list( - list(d1vars = "ID", - d2vars = "ID", - fun = "nameDists", - extraparams = NULL, - weight = 1), - list(d1vars = c("Surname", "OtherNames"), - d2vars = c("SurnameLab", "OtherNameLab"), - fun = "nameDists", - extraparams = NULL, - weight = 0.5) - ) -# This will get all of the indices that match the ID and Names with a -# threshold of 0.25 -res <- matchEpiData(dat1 = case, - dat2 = lab, - funlist = funlist, - thresh = 0.25, - giveWeight = TRUE) -res # List of weights -getIndexList(res) # List of indices -} -\keyword{internal} - From 74cfc087a5c77f9a7de32872af71c88d85da1d83 Mon Sep 17 00:00:00 2001 From: Zhian Kamvar Date: Sat, 16 Jul 2016 17:48:55 -0700 Subject: [PATCH 03/16] Add tableFromMatch function I've added a function to return tables from the match query. It can work in addition or in lieu of `matchEpiData()`. I have added documentation for this and also updated the documentation of `matchEpiData()`. --- NAMESPACE | 1 + R/epi_self_match.R | 10 ++--- R/tableFromMatch.R | 88 ++++++++++++++++++++++++++++++++++++++++++ man/matchEpiData.Rd | 10 ++--- man/tablesFromMatch.Rd | 66 +++++++++++++++++++++++++++++++ 5 files changed, 161 insertions(+), 14 deletions(-) create mode 100644 R/tableFromMatch.R create mode 100644 man/tablesFromMatch.Rd diff --git a/NAMESPACE b/NAMESPACE index c03d96a..3e81f22 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,6 +12,7 @@ export(matchEpiData) export(nameDists) export(processFunctionList) export(returnMatches) +export(tablesFromMatch) importFrom(lubridate,parse_date_time) importFrom(shiny,runApp) importFrom(stats,dist) diff --git a/R/epi_self_match.R b/R/epi_self_match.R index 6fb57be..99c36a7 100644 --- a/R/epi_self_match.R +++ b/R/epi_self_match.R @@ -42,16 +42,12 @@ #' res <- matchEpiData(dat1 = case, #' dat2 = lab, #' funlist = funlist, -#' thresh = 0.25) +#' thresh = 0.25, +#' giveWeight = FALSE) #' # List of indices #' res #' -#' # Printing out the matching names in decreasing order of matching -#' invisible(lapply(res, function(i) { -#' print(case[i$d1, c("Surname", "OtherNames")]) -#' print(lab[i$d2, c("SurnameLab", "OtherNameLab")]) -#' cat("\n\t--------\n") -#' })) +#' tablesFromMatch(case, lab, funlist, matchList = res) matchEpiData <- function(dat1, dat2 = NULL, funlist = list(), thresh = 0.05, giveWeight = FALSE){ the_matrices <- processFunctionList(dat1, dat2, funlist) the_weights <- unlist(lapply(funlist, function(i) i$weight)) diff --git a/R/tableFromMatch.R b/R/tableFromMatch.R new file mode 100644 index 0000000..a6adcc9 --- /dev/null +++ b/R/tableFromMatch.R @@ -0,0 +1,88 @@ +#' Generate a list of tables aggregating the matched values +#' +#' @inheritParams processFunctionList +#' @param matchList a list derived from \code{\link{matchEpiData}}. If this is +#' \code{NULL}, a threshold can be supplied instead to calculate the list on +#' the fly. +#' +#' @return a list of data frames sorted in decreasing order of matching-ness. +#' @export +#' +#' @examples +#' ## Loading Data +#' indata <- system.file("files", package = "epimatch") +#' indata <- dir(indata, full.names = TRUE) +#' x <- lapply(indata, read.csv, stringsAsFactors = FALSE) +#' names(x) <- basename(indata) +#' +#' # We will use one data set from the case information and lab results +#' case <- x[["CaseInformationForm.csv"]] +#' lab <- x[["LaboratoryResultsForm7.csv"]] +#' +#' funlist <- list( +#' list(d1vars = "ID", +#' d2vars = "ID", +#' fun = "nameDists", +#' extraparams = NULL, +#' weight = 1), +#' list(d1vars = c("Surname", "OtherNames"), +#' d2vars = c("SurnameLab", "OtherNameLab"), +#' fun = "nameDists", +#' extraparams = NULL, +#' weight = 0.5) +#' ) +#' # This will get all of the indices that match the ID and Names with a +#' # threshold of 0.25 +#' res <- matchEpiData(dat1 = case, +#' dat2 = lab, +#' funlist = funlist, +#' thresh = 0.25, +#' giveWeight = TRUE) +#' tablesFromMatch(case, lab, funlist, matchList = res) +#' tablesFromMatch(case, lab, funlist, matchList = 0.25) +tablesFromMatch <- function(dat1, dat2 = NULL, funlist = list(), matchList = NULL){ + + # Check if incoming data are cromulent + dat2exists <- !is.null(dat2) && is.data.frame(dat2) + if (!is.data.frame(dat1) | !dat2exists){ + stop("data must be in data frame format") + } + # Check if there are any matches at all + if (is.null(matchList) || length(matchList) == 0){ + stop("no matches found!") + } + # If the matchList is a threshold parameter, pass it to the matchEpiData function. + if (is.numeric(matchList) && length(matchList) == 1){ + matchList <- matchEpiData(dat1, dat2, funlist = funlist, thresh = matchList) + } else if (!is.integer(unlist(matchList))){ + matchList <- getIndexList(matchList) + } + + # Grab the tested variables from dat1 + d1vars <- lapply(funlist, "[[", "d1vars") + d1 <- lapply(d1vars, function(i) dat1[i]) + # Since multiple columns can be evaluated by a single function, we are going + # to collapse these columns into one. + d1 <- data.frame(lapply(d1, collapseValues)) + + if (dat2exists){ + # Grab the tested variables from dat2 and rename them based off of dat1. + # This allows us to bind the data together a lot easier. + d2vars <- lapply(funlist, "[[", "d2vars") + d2 <- lapply(d2vars, function(i) dat2[i]) + d2 <- data.frame(lapply(d2, collapseValues)) + names(d2) <- names(d1) + } + + # The output is a list of data frames subset in decreasing order of matching. + outlist <- vector(length = length(matchList), mode = "list") + for (i in seq(matchList)){ + indices <- matchList[[i]] + if (dat2exists){ + outlist[[i]] <- rbind(d1[indices$d1, ], d2[indices$d2, ]) + } else { + outlist[[i]] <- d1[indices$d1, ] + } + } + return(outlist) +} diff --git a/man/matchEpiData.Rd b/man/matchEpiData.Rd index 92bde0d..9f6d911 100644 --- a/man/matchEpiData.Rd +++ b/man/matchEpiData.Rd @@ -68,15 +68,11 @@ funlist <- list( res <- matchEpiData(dat1 = case, dat2 = lab, funlist = funlist, - thresh = 0.25) + thresh = 0.25, + giveWeight = FALSE) # List of indices res -# Printing out the matching names in decreasing order of matching -invisible(lapply(res, function(i) { - print(case[i$d1, c("Surname", "OtherNames")]) - print(lab[i$d2, c("SurnameLab", "OtherNameLab")]) - cat("\\n\\t--------\\n") - })) +tablesFromMatch(case, lab, funlist, matchList = res) } diff --git a/man/tablesFromMatch.Rd b/man/tablesFromMatch.Rd new file mode 100644 index 0000000..b076e24 --- /dev/null +++ b/man/tablesFromMatch.Rd @@ -0,0 +1,66 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tableFromMatch.R +\name{tablesFromMatch} +\alias{tablesFromMatch} +\title{Generate a list of tables aggregating the matched values} +\usage{ +tablesFromMatch(dat1, dat2 = NULL, funlist = list(), matchList = NULL) +} +\arguments{ +\item{dat1}{An input linelist} + +\item{dat2}{An optional extra linelist} + +\item{funlist}{A list containing lists containing: +\itemize{ + \item d1vars - variable names for dataset 1 + \item d2vars - variable names for dataset 2 + \item fun - function name to process on these variables + \item extraparams - extra parameters that need to be applied with the function. + \item weights - a weight vector to scale each matrix (not used in processFunctionList). +}} + +\item{matchList}{a list derived from \code{\link{matchEpiData}}. If this is +\code{NULL}, a threshold can be supplied instead to calculate the list on +the fly.} +} +\value{ +a list of data frames sorted in decreasing order of matching-ness. +} +\description{ +Generate a list of tables aggregating the matched values +} +\examples{ +## Loading Data +indata <- system.file("files", package = "epimatch") +indata <- dir(indata, full.names = TRUE) +x <- lapply(indata, read.csv, stringsAsFactors = FALSE) +names(x) <- basename(indata) + +# We will use one data set from the case information and lab results +case <- x[["CaseInformationForm.csv"]] +lab <- x[["LaboratoryResultsForm7.csv"]] + +funlist <- list( + list(d1vars = "ID", + d2vars = "ID", + fun = "nameDists", + extraparams = NULL, + weight = 1), + list(d1vars = c("Surname", "OtherNames"), + d2vars = c("SurnameLab", "OtherNameLab"), + fun = "nameDists", + extraparams = NULL, + weight = 0.5) + ) +# This will get all of the indices that match the ID and Names with a +# threshold of 0.25 +res <- matchEpiData(dat1 = case, + dat2 = lab, + funlist = funlist, + thresh = 0.25, + giveWeight = TRUE) +tablesFromMatch(case, lab, funlist, matchList = res) +tablesFromMatch(case, lab, funlist, matchList = 0.25) +} + From 89f900e6ceb52b8f030dfdc0ce23daef8bcbb623 Mon Sep 17 00:00:00 2001 From: Zhian Kamvar Date: Thu, 4 Aug 2016 14:09:54 -0400 Subject: [PATCH 04/16] update docs to include library call --- R/epi_self_match.R | 1 + R/matchAge.R | 1 + R/matchDate.R | 1 + R/matchGender.R | 1 + R/matchGeneric.R | 1 + R/matchNames.R | 1 + R/process_matching.R | 1 + R/tableFromMatch.R | 1 + man/ageDists.Rd | 1 + man/dateDists.Rd | 1 + man/genderDists.Rd | 1 + man/genericDists.Rd | 1 + man/matchEpiData.Rd | 1 + man/nameDists.Rd | 1 + man/processFunctionList.Rd | 1 + man/tablesFromMatch.Rd | 1 + 16 files changed, 16 insertions(+) diff --git a/R/epi_self_match.R b/R/epi_self_match.R index 99c36a7..f0cffa0 100644 --- a/R/epi_self_match.R +++ b/R/epi_self_match.R @@ -16,6 +16,7 @@ #' #' @examples #' ## Loading Data +#' library('epimatch') #' indata <- system.file("files", package = "epimatch") #' indata <- dir(indata, full.names = TRUE) #' x <- lapply(indata, read.csv, stringsAsFactors = FALSE) diff --git a/R/matchAge.R b/R/matchAge.R index c037893..f15f357 100644 --- a/R/matchAge.R +++ b/R/matchAge.R @@ -14,6 +14,7 @@ #' @export #' #' @examples +#' library('epimatch') #' set.seed(999) #' x <- data.frame(age = rpois(5, 30), age_class = "YEAR") #' y <- data.frame(age = rpois(5, 18), age_class = "MONTH") diff --git a/R/matchDate.R b/R/matchDate.R index a3527c6..860dbd8 100644 --- a/R/matchDate.R +++ b/R/matchDate.R @@ -14,6 +14,7 @@ #' @importFrom lubridate parse_date_time #' #' @examples +#' library('epimatch') #' #' # Two data sets in Month-Day-Year and Year-Month-Day format, respectively. #' # Note that the way the dates can vary within data sets, but it's assumed diff --git a/R/matchGender.R b/R/matchGender.R index 2414b5d..fa6d17c 100644 --- a/R/matchGender.R +++ b/R/matchGender.R @@ -11,6 +11,7 @@ #' @export #' #' @examples +#' library('epimatch') #' test <- data.frame(c("male", "f", "m"), stringsAsFactors = FALSE) #' genderDists(test) genderDists <- function(dat1, dat2=NULL, diff --git a/R/matchGeneric.R b/R/matchGeneric.R index 5cff81b..025e045 100644 --- a/R/matchGeneric.R +++ b/R/matchGeneric.R @@ -7,6 +7,7 @@ #' @export #' #' @examples +#' library('epimatch') #' set.seed(9) #' x <- data.frame(dat = sample(10, replace = TRUE)) #' x$let <- letters[x$dat] diff --git a/R/matchNames.R b/R/matchNames.R index ed1a64d..a7f1314 100644 --- a/R/matchNames.R +++ b/R/matchNames.R @@ -8,6 +8,7 @@ #' @export #' #' @examples +#' library('epimatch') #' set.seed(9) #' x <- data.frame(x = letters, y = LETTERS, z = 1:26) #' x <- x[sample(nrow(x), 10, replace = TRUE), ] diff --git a/R/process_matching.R b/R/process_matching.R index 5b3c34c..6cb4806 100644 --- a/R/process_matching.R +++ b/R/process_matching.R @@ -20,6 +20,7 @@ #' @export #' #' @examples +#' library('epimatch') #' ## Loading Data #' indata <- system.file("files", package = "epimatch") #' indata <- dir(indata, full.names = TRUE) diff --git a/R/tableFromMatch.R b/R/tableFromMatch.R index a6adcc9..607b714 100644 --- a/R/tableFromMatch.R +++ b/R/tableFromMatch.R @@ -10,6 +10,7 @@ #' #' @examples #' ## Loading Data +#' library('epimatch') #' indata <- system.file("files", package = "epimatch") #' indata <- dir(indata, full.names = TRUE) #' x <- lapply(indata, read.csv, stringsAsFactors = FALSE) diff --git a/man/ageDists.Rd b/man/ageDists.Rd index 039da4a..6c9418f 100644 --- a/man/ageDists.Rd +++ b/man/ageDists.Rd @@ -27,6 +27,7 @@ This function will take one or two data frames containing ages data frame with a specifier for months or year in the second column. } \examples{ +library('epimatch') set.seed(999) x <- data.frame(age = rpois(5, 30), age_class = "YEAR") y <- data.frame(age = rpois(5, 18), age_class = "MONTH") diff --git a/man/dateDists.Rd b/man/dateDists.Rd index f080758..6857dbc 100644 --- a/man/dateDists.Rd +++ b/man/dateDists.Rd @@ -27,6 +27,7 @@ The formats for dates are passed to \pkg{lubridate}'s function "ymd", etc. } \examples{ +library('epimatch') # Two data sets in Month-Day-Year and Year-Month-Day format, respectively. # Note that the way the dates can vary within data sets, but it's assumed diff --git a/man/genderDists.Rd b/man/genderDists.Rd index 0405e95..60e5c38 100644 --- a/man/genderDists.Rd +++ b/man/genderDists.Rd @@ -27,6 +27,7 @@ A thin wrapper around generic, which converts some known male and female codes to binary } \examples{ +library('epimatch') test <- data.frame(c("male", "f", "m"), stringsAsFactors = FALSE) genderDists(test) } diff --git a/man/genericDists.Rd b/man/genericDists.Rd index aa34061..a41b95a 100644 --- a/man/genericDists.Rd +++ b/man/genericDists.Rd @@ -18,6 +18,7 @@ a pairwise matrix of scores from 0 (exact match) to 1 (no match) generic function for exact matching single columns } \examples{ +library('epimatch') set.seed(9) x <- data.frame(dat = sample(10, replace = TRUE)) x$let <- letters[x$dat] diff --git a/man/matchEpiData.Rd b/man/matchEpiData.Rd index 9f6d911..f3e2a4f 100644 --- a/man/matchEpiData.Rd +++ b/man/matchEpiData.Rd @@ -42,6 +42,7 @@ this function will take in one or two data sets, a list of functions } \examples{ ## Loading Data +library('epimatch') indata <- system.file("files", package = "epimatch") indata <- dir(indata, full.names = TRUE) x <- lapply(indata, read.csv, stringsAsFactors = FALSE) diff --git a/man/nameDists.Rd b/man/nameDists.Rd index 703b01d..c79dd12 100644 --- a/man/nameDists.Rd +++ b/man/nameDists.Rd @@ -20,6 +20,7 @@ Clean and match names column by removing punctuation and replacing punctuation with an underscore } \examples{ +library('epimatch') set.seed(9) x <- data.frame(x = letters, y = LETTERS, z = 1:26) x <- x[sample(nrow(x), 10, replace = TRUE), ] diff --git a/man/processFunctionList.Rd b/man/processFunctionList.Rd index 66011bb..f213384 100644 --- a/man/processFunctionList.Rd +++ b/man/processFunctionList.Rd @@ -29,6 +29,7 @@ column names, and function parameters, and return a list of matrices by function. } \examples{ +library('epimatch') ## Loading Data indata <- system.file("files", package = "epimatch") indata <- dir(indata, full.names = TRUE) diff --git a/man/tablesFromMatch.Rd b/man/tablesFromMatch.Rd index b076e24..f8f7605 100644 --- a/man/tablesFromMatch.Rd +++ b/man/tablesFromMatch.Rd @@ -32,6 +32,7 @@ Generate a list of tables aggregating the matched values } \examples{ ## Loading Data +library('epimatch') indata <- system.file("files", package = "epimatch") indata <- dir(indata, full.names = TRUE) x <- lapply(indata, read.csv, stringsAsFactors = FALSE) From fea14c432e51dbe2e44f3f44867de2877ed23a16 Mon Sep 17 00:00:00 2001 From: Zhian Kamvar Date: Thu, 4 Aug 2016 15:41:12 -0400 Subject: [PATCH 05/16] add collapse argument to output single data frame --- R/tableFromMatch.R | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/R/tableFromMatch.R b/R/tableFromMatch.R index 607b714..dbeb062 100644 --- a/R/tableFromMatch.R +++ b/R/tableFromMatch.R @@ -4,6 +4,13 @@ #' @param matchList a list derived from \code{\link{matchEpiData}}. If this is #' \code{NULL}, a threshold can be supplied instead to calculate the list on #' the fly. +#' @param collapse When \code{TRUE}, the list of data frames will be collapsed +#' into one data frame with three extra columns specifying: +#' \itemize{ +#' \item match group +#' \item data set +#' \item index in data set +#' } #' #' @return a list of data frames sorted in decreasing order of matching-ness. #' @export @@ -41,7 +48,8 @@ #' giveWeight = TRUE) #' tablesFromMatch(case, lab, funlist, matchList = res) #' tablesFromMatch(case, lab, funlist, matchList = 0.25) -tablesFromMatch <- function(dat1, dat2 = NULL, funlist = list(), matchList = NULL){ +tablesFromMatch <- function(dat1, dat2 = NULL, funlist = list(), + matchList = NULL, collapse = TRUE){ # Check if incoming data are cromulent dat2exists <- !is.null(dat2) && is.data.frame(dat2) @@ -85,5 +93,19 @@ tablesFromMatch <- function(dat1, dat2 = NULL, funlist = list(), matchList = NUL outlist[[i]] <- d1[indices$d1, ] } } + + if (collapse){ + groups <- lapply(lapply(matchList, unlist), length) + groups <- rep(seq_along(matchList), groups) + indices <- unlist(matchList) + data_length <- unlist(lapply(matchList, lapply, length)) + data_source <- rep(c("d1", "d2"), length(matchList)) + data_source <- rep(data_source, data_length) + outlist <- do.call("rbind", outlist) + outlist$groups <- groups + outlist$dataset <- data_source + outlist$index <- indices + } + return(outlist) } From 451a6c3f543263573fb43e712d0fe540df1a60c0 Mon Sep 17 00:00:00 2001 From: Zhian Kamvar Date: Thu, 4 Aug 2016 15:43:14 -0400 Subject: [PATCH 06/16] update documentation --- R/epimatch-package.r | 1 + man/epimatch.Rd | 1 + man/tablesFromMatch.Rd | 11 ++++++++++- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/R/epimatch-package.r b/R/epimatch-package.r index 2bc77cb..ec75319 100644 --- a/R/epimatch-package.r +++ b/R/epimatch-package.r @@ -25,6 +25,7 @@ #' \itemize{ #' \item \code{\link{processFunctionList}} #' \item \code{\link{matchEpiData}} +#' \item \code{\link{tablesFromMatch}} #' } #' #' @section Dissimilarity Functions: Each dissimilarity function returns a diff --git a/man/epimatch.Rd b/man/epimatch.Rd index 612bbe5..96d3767 100644 --- a/man/epimatch.Rd +++ b/man/epimatch.Rd @@ -37,6 +37,7 @@ Find matching patient records across tabular datasets \itemize{ \item \code{\link{processFunctionList}} \item \code{\link{matchEpiData}} + \item \code{\link{tablesFromMatch}} } } diff --git a/man/tablesFromMatch.Rd b/man/tablesFromMatch.Rd index f8f7605..2634c90 100644 --- a/man/tablesFromMatch.Rd +++ b/man/tablesFromMatch.Rd @@ -4,7 +4,8 @@ \alias{tablesFromMatch} \title{Generate a list of tables aggregating the matched values} \usage{ -tablesFromMatch(dat1, dat2 = NULL, funlist = list(), matchList = NULL) +tablesFromMatch(dat1, dat2 = NULL, funlist = list(), matchList = NULL, + collapse = TRUE) } \arguments{ \item{dat1}{An input linelist} @@ -23,6 +24,14 @@ tablesFromMatch(dat1, dat2 = NULL, funlist = list(), matchList = NULL) \item{matchList}{a list derived from \code{\link{matchEpiData}}. If this is \code{NULL}, a threshold can be supplied instead to calculate the list on the fly.} + +\item{collapse}{When \code{TRUE}, the list of data frames will be collapsed +into one data frame with three extra columns specifying: +\itemize{ + \item match group + \item data set + \item index in data set +}} } \value{ a list of data frames sorted in decreasing order of matching-ness. From e06555b591f684236d6c97c86b586d564dc113ab Mon Sep 17 00:00:00 2001 From: Zhian Kamvar Date: Thu, 4 Aug 2016 15:48:56 -0400 Subject: [PATCH 07/16] add notes to future self and others --- R/tableFromMatch.R | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/R/tableFromMatch.R b/R/tableFromMatch.R index dbeb062..41d255a 100644 --- a/R/tableFromMatch.R +++ b/R/tableFromMatch.R @@ -95,12 +95,23 @@ tablesFromMatch <- function(dat1, dat2 = NULL, funlist = list(), } if (collapse){ + # This is where we make the data tidy by placing everything in a single data + # frame and adding three extra columns that contain the information that was + # in the result from `matchEpiData()`. + + ## Give the indices of the highest list ------------------------- groups <- lapply(lapply(matchList, unlist), length) groups <- rep(seq_along(matchList), groups) - indices <- unlist(matchList) + + ## Give the source of the data ---------------------------------- data_length <- unlist(lapply(matchList, lapply, length)) data_source <- rep(c("d1", "d2"), length(matchList)) data_source <- rep(data_source, data_length) + + ## Give the indices within each data set ------------------------ + indices <- unlist(matchList) + + ## Create the resulting tidy data frame and add information ----- outlist <- do.call("rbind", outlist) outlist$groups <- groups outlist$dataset <- data_source From fff6688c00f9341b487ef272d8a582ec8cfdc904 Mon Sep 17 00:00:00 2001 From: Zhian Kamvar Date: Thu, 4 Aug 2016 17:03:43 -0400 Subject: [PATCH 08/16] Add argument to return thresholds in tidy data --- R/tableFromMatch.R | 35 +++++++++++++++++++++++++---------- man/tablesFromMatch.Rd | 18 ++++++++++++------ 2 files changed, 37 insertions(+), 16 deletions(-) diff --git a/R/tableFromMatch.R b/R/tableFromMatch.R index 41d255a..2f95679 100644 --- a/R/tableFromMatch.R +++ b/R/tableFromMatch.R @@ -5,13 +5,17 @@ #' \code{NULL}, a threshold can be supplied instead to calculate the list on #' the fly. #' @param collapse When \code{TRUE}, the list of data frames will be collapsed -#' into one data frame with three extra columns specifying: -#' \itemize{ -#' \item match group -#' \item data set -#' \item index in data set +#' into one data frame with three or four extra columns specifying: +#' \describe{ +#' \item{groups}{ match group in decreasing order of score} +#' \item{dataset}{ data set of origin} +#' \item{index}{ index in data set} +#' \item{score}{ the score of the matches to the first item in the group.} #' } #' +#' @details This will collect all of the data from \code{\link{matchEpiData}} +#' and present it in table format. It will collapse them into tidy tables and, +#' if provided, the score of the matches will be provided. #' @return a list of data frames sorted in decreasing order of matching-ness. #' @export #' @@ -46,7 +50,7 @@ #' funlist = funlist, #' thresh = 0.25, #' giveWeight = TRUE) -#' tablesFromMatch(case, lab, funlist, matchList = res) +#' tablesFromMatch(case, lab, funlist, matchList = res, collapse = FALSE) #' tablesFromMatch(case, lab, funlist, matchList = 0.25) tablesFromMatch <- function(dat1, dat2 = NULL, funlist = list(), matchList = NULL, collapse = TRUE){ @@ -60,11 +64,17 @@ tablesFromMatch <- function(dat1, dat2 = NULL, funlist = list(), if (is.null(matchList) || length(matchList) == 0){ stop("no matches found!") } - # If the matchList is a threshold parameter, pass it to the matchEpiData function. + # If the matchList is a threshold parameter, pass it to the matchEpiData + # function. if (is.numeric(matchList) && length(matchList) == 1){ - matchList <- matchEpiData(dat1, dat2, funlist = funlist, thresh = matchList) - } else if (!is.integer(unlist(matchList))){ - matchList <- getIndexList(matchList) + matchList <- matchEpiData(dat1, dat2, funlist = funlist, thresh = matchList, + giveWeight = TRUE) + } + # If the incoming list contains numeric values, collect the thresholds and + # convert the list to indices. + if (!is.integer(unlist(matchList))){ + theThresholds <- unlist(matchList) + matchList <- getIndexList(matchList) } # Grab the tested variables from dat1 @@ -116,6 +126,11 @@ tablesFromMatch <- function(dat1, dat2 = NULL, funlist = list(), outlist$groups <- groups outlist$dataset <- data_source outlist$index <- indices + + ## If the incoming list are thresholds, add them as a column ---- + if (exists("theThresholds", inherits = FALSE)){ + outlist$score <- theThresholds + } } return(outlist) diff --git a/man/tablesFromMatch.Rd b/man/tablesFromMatch.Rd index 2634c90..f35eca4 100644 --- a/man/tablesFromMatch.Rd +++ b/man/tablesFromMatch.Rd @@ -26,11 +26,12 @@ tablesFromMatch(dat1, dat2 = NULL, funlist = list(), matchList = NULL, the fly.} \item{collapse}{When \code{TRUE}, the list of data frames will be collapsed -into one data frame with three extra columns specifying: -\itemize{ - \item match group - \item data set - \item index in data set +into one data frame with three or four extra columns specifying: +\describe{ + \item{groups}{ match group in decreasing order of score} + \item{dataset}{ data set of origin} + \item{index}{ index in data set} + \item{score}{ the score of the matches to the first item in the group.} }} } \value{ @@ -39,6 +40,11 @@ a list of data frames sorted in decreasing order of matching-ness. \description{ Generate a list of tables aggregating the matched values } +\details{ +This will collect all of the data from \code{\link{matchEpiData}} + and present it in table format. It will collapse them into tidy tables and, + if provided, the score of the matches will be provided. +} \examples{ ## Loading Data library('epimatch') @@ -70,7 +76,7 @@ res <- matchEpiData(dat1 = case, funlist = funlist, thresh = 0.25, giveWeight = TRUE) -tablesFromMatch(case, lab, funlist, matchList = res) +tablesFromMatch(case, lab, funlist, matchList = res, collapse = FALSE) tablesFromMatch(case, lab, funlist, matchList = 0.25) } From 9061ea37bb80729e670298f0b3e530dac1561156 Mon Sep 17 00:00:00 2001 From: Zhian Kamvar Date: Thu, 4 Aug 2016 17:08:01 -0400 Subject: [PATCH 09/16] make getIndexList a user-facing function --- R/epi_self_match.R | 49 +++++++++++++++++++++++++++++++++++++++ R/internal.R | 50 ---------------------------------------- man/getIndexList.Rd | 56 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 105 insertions(+), 50 deletions(-) create mode 100644 man/getIndexList.Rd diff --git a/R/epi_self_match.R b/R/epi_self_match.R index f0cffa0..008cebb 100644 --- a/R/epi_self_match.R +++ b/R/epi_self_match.R @@ -61,3 +61,52 @@ matchEpiData <- function(dat1, dat2 = NULL, funlist = list(), thresh = 0.05, giv } return(out) } + +#' return a list of indices from matchEpiData +#' +#' The output of matchEpiData can either be a list of indices or a named list of +#' weights. If it's a named list of weights, this function will return the list +#' of indices. +#' +#' @param matchList a list of lists of weights +#' +#' @return an unnamed list of lists of indices +#' @keywords internal utilities +#' @examples +#' ## Loading Data +#' library('epimatch') +#' indata <- system.file("files", package = "epimatch") +#' indata <- dir(indata, full.names = TRUE) +#' x <- lapply(indata, read.csv, stringsAsFactors = FALSE) +#' names(x) <- basename(indata) +#' +#' # We will use one data set from the case information and lab results +#' case <- x[["CaseInformationForm.csv"]] +#' lab <- x[["LaboratoryResultsForm7.csv"]] +#' +#' funlist <- list( +#' list(d1vars = "ID", +#' d2vars = "ID", +#' fun = "nameDists", +#' extraparams = NULL, +#' weight = 1), +#' list(d1vars = c("Surname", "OtherNames"), +#' d2vars = c("SurnameLab", "OtherNameLab"), +#' fun = "nameDists", +#' extraparams = NULL, +#' weight = 0.5) +#' ) +#' # This will get all of the indices that match the ID and Names with a +#' # threshold of 0.25 +#' res <- matchEpiData(dat1 = case, +#' dat2 = lab, +#' funlist = funlist, +#' thresh = 0.25, +#' giveWeight = TRUE) +#' res # List of weights +#' getIndexList(res) # List of indices +getIndexList <- function(matchList){ + out <- lapply(matchList, lapply, function(i) as.integer(names(i))) + names(out) <- NULL + return(out) +} diff --git a/R/internal.R b/R/internal.R index 0a5a15e..3e7b62b 100644 --- a/R/internal.R +++ b/R/internal.R @@ -67,56 +67,6 @@ splitComma <- function(dat) return(new_dat) } - -#' return a list of indices from matchEpiData -#' -#' The output of matchEpiData can either be a list of indices or a named list of -#' weights. If it's a named list of weights, this function will return the list -#' of indices. -#' -#' @param matchList a list of lists of weights. -#' -#' @return an unnamed list of lists of indices -#' @keywords internal -#' @noRd -#' @examples -#' ## Loading Data -#' indata <- system.file("files", package = "epimatch") -#' indata <- dir(indata, full.names = TRUE) -#' x <- lapply(indata, read.csv, stringsAsFactors = FALSE) -#' names(x) <- basename(indata) -#' -#' # We will use one data set from the case information and lab results -#' case <- x[["CaseInformationForm.csv"]] -#' lab <- x[["LaboratoryResultsForm7.csv"]] -#' -#' funlist <- list( -#' list(d1vars = "ID", -#' d2vars = "ID", -#' fun = "nameDists", -#' extraparams = NULL, -#' weight = 1), -#' list(d1vars = c("Surname", "OtherNames"), -#' d2vars = c("SurnameLab", "OtherNameLab"), -#' fun = "nameDists", -#' extraparams = NULL, -#' weight = 0.5) -#' ) -#' # This will get all of the indices that match the ID and Names with a -#' # threshold of 0.25 -#' res <- matchEpiData(dat1 = case, -#' dat2 = lab, -#' funlist = funlist, -#' thresh = 0.25, -#' giveWeight = TRUE) -#' res # List of weights -#' getIndexList(res) # List of indices -getIndexList <- function(matchList){ - out <- lapply(matchList, lapply, function(i) as.integer(names(i))) - names(out) <- NULL - return(out) -} - #' Collapse columns of a data frame into one #' #' @param df a data frame with one or more columns diff --git a/man/getIndexList.Rd b/man/getIndexList.Rd new file mode 100644 index 0000000..f3a9dfe --- /dev/null +++ b/man/getIndexList.Rd @@ -0,0 +1,56 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/epi_self_match.R +\name{getIndexList} +\alias{getIndexList} +\title{return a list of indices from matchEpiData} +\usage{ +getIndexList(matchList) +} +\arguments{ +\item{matchList}{a list of lists of weights} +} +\value{ +an unnamed list of lists of indices +} +\description{ +The output of matchEpiData can either be a list of indices or a named list of +weights. If it's a named list of weights, this function will return the list +of indices. +} +\examples{ +## Loading Data +library('epimatch') +indata <- system.file("files", package = "epimatch") +indata <- dir(indata, full.names = TRUE) +x <- lapply(indata, read.csv, stringsAsFactors = FALSE) +names(x) <- basename(indata) + +# We will use one data set from the case information and lab results +case <- x[["CaseInformationForm.csv"]] +lab <- x[["LaboratoryResultsForm7.csv"]] + +funlist <- list( + list(d1vars = "ID", + d2vars = "ID", + fun = "nameDists", + extraparams = NULL, + weight = 1), + list(d1vars = c("Surname", "OtherNames"), + d2vars = c("SurnameLab", "OtherNameLab"), + fun = "nameDists", + extraparams = NULL, + weight = 0.5) + ) +# This will get all of the indices that match the ID and Names with a +# threshold of 0.25 +res <- matchEpiData(dat1 = case, + dat2 = lab, + funlist = funlist, + thresh = 0.25, + giveWeight = TRUE) +res # List of weights +getIndexList(res) # List of indices +} +\keyword{internal,} +\keyword{utilities} + From 311647fbfe0da4f1565604a7e55092bd9219b547 Mon Sep 17 00:00:00 2001 From: Zhian Kamvar Date: Thu, 4 Aug 2016 17:13:04 -0400 Subject: [PATCH 10/16] export getIndexList --- NAMESPACE | 1 + R/epi_self_match.R | 1 + man/getIndexList.Rd | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index 3e81f22..3a4c84a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,6 +6,7 @@ export(dateDists) export(distFuns) export(genderDists) export(genericDists) +export(getIndexList) export(launch) export(locationDists) export(matchEpiData) diff --git a/R/epi_self_match.R b/R/epi_self_match.R index 008cebb..c86b17f 100644 --- a/R/epi_self_match.R +++ b/R/epi_self_match.R @@ -72,6 +72,7 @@ matchEpiData <- function(dat1, dat2 = NULL, funlist = list(), thresh = 0.05, giv #' #' @return an unnamed list of lists of indices #' @keywords internal utilities +#' @export #' @examples #' ## Loading Data #' library('epimatch') diff --git a/man/getIndexList.Rd b/man/getIndexList.Rd index f3a9dfe..20c708f 100644 --- a/man/getIndexList.Rd +++ b/man/getIndexList.Rd @@ -51,6 +51,6 @@ res <- matchEpiData(dat1 = case, res # List of weights getIndexList(res) # List of indices } -\keyword{internal,} +\keyword{internal} \keyword{utilities} From d07d8ca347f06c5fc8e25582123a016becf2ea86 Mon Sep 17 00:00:00 2001 From: Zhian Kamvar Date: Thu, 4 Aug 2016 17:17:19 -0400 Subject: [PATCH 11/16] update shiny server to conserve the weights This will allow future development so that we can give the user the scores/weights in the table that is output. --- inst/shiny/server.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/inst/shiny/server.R b/inst/shiny/server.R index d5fedd6..16a06f8 100644 --- a/inst/shiny/server.R +++ b/inst/shiny/server.R @@ -251,11 +251,13 @@ function(input, output, session) { hide("findMatchesLoading") }) tryCatch({ - values$results <- epimatch::matchEpiData( + theMatches <- epimatch::matchEpiData( dat1 = values$data1, dat2 = values$data2, funlist = funlist, - thresh = input$threshold + thresh = input$threshold, + giveWeight = TRUE ) + values$results <- epimatch::getIndexList(theMatches) show(selector = ".findMatchesDone") delay(1000, hide(selector = ".findMatchesDone", anim = TRUE, animType = "fade", time = 0.5)) From 6f2da4b5d4b92c13fd7ee45867384a09e24bb9c4 Mon Sep 17 00:00:00 2001 From: Zhian Kamvar Date: Sat, 6 Aug 2016 09:28:45 -0700 Subject: [PATCH 12/16] Remove library calls from functions --- R/epi_self_match.R | 4 ++-- R/matchAge.R | 2 +- R/matchDate.R | 2 +- R/matchGender.R | 2 +- R/matchGeneric.R | 2 +- R/matchNames.R | 2 +- R/process_matching.R | 2 +- R/tableFromMatch.R | 2 +- man/ageDists.Rd | 2 +- man/dateDists.Rd | 2 +- man/genderDists.Rd | 2 +- man/genericDists.Rd | 2 +- man/getIndexList.Rd | 2 +- man/matchEpiData.Rd | 2 +- man/nameDists.Rd | 2 +- man/processFunctionList.Rd | 2 +- man/tablesFromMatch.Rd | 2 +- 17 files changed, 18 insertions(+), 18 deletions(-) diff --git a/R/epi_self_match.R b/R/epi_self_match.R index c86b17f..75707d1 100644 --- a/R/epi_self_match.R +++ b/R/epi_self_match.R @@ -16,7 +16,7 @@ #' #' @examples #' ## Loading Data -#' library('epimatch') +#' #' indata <- system.file("files", package = "epimatch") #' indata <- dir(indata, full.names = TRUE) #' x <- lapply(indata, read.csv, stringsAsFactors = FALSE) @@ -75,7 +75,7 @@ matchEpiData <- function(dat1, dat2 = NULL, funlist = list(), thresh = 0.05, giv #' @export #' @examples #' ## Loading Data -#' library('epimatch') +#' #' indata <- system.file("files", package = "epimatch") #' indata <- dir(indata, full.names = TRUE) #' x <- lapply(indata, read.csv, stringsAsFactors = FALSE) diff --git a/R/matchAge.R b/R/matchAge.R index f15f357..b559cf2 100644 --- a/R/matchAge.R +++ b/R/matchAge.R @@ -14,7 +14,7 @@ #' @export #' #' @examples -#' library('epimatch') +#' #' set.seed(999) #' x <- data.frame(age = rpois(5, 30), age_class = "YEAR") #' y <- data.frame(age = rpois(5, 18), age_class = "MONTH") diff --git a/R/matchDate.R b/R/matchDate.R index 860dbd8..1d15c41 100644 --- a/R/matchDate.R +++ b/R/matchDate.R @@ -14,7 +14,7 @@ #' @importFrom lubridate parse_date_time #' #' @examples -#' library('epimatch') +#' #' #' # Two data sets in Month-Day-Year and Year-Month-Day format, respectively. #' # Note that the way the dates can vary within data sets, but it's assumed diff --git a/R/matchGender.R b/R/matchGender.R index fa6d17c..29a1a8d 100644 --- a/R/matchGender.R +++ b/R/matchGender.R @@ -11,7 +11,7 @@ #' @export #' #' @examples -#' library('epimatch') +#' #' test <- data.frame(c("male", "f", "m"), stringsAsFactors = FALSE) #' genderDists(test) genderDists <- function(dat1, dat2=NULL, diff --git a/R/matchGeneric.R b/R/matchGeneric.R index 025e045..4d70b10 100644 --- a/R/matchGeneric.R +++ b/R/matchGeneric.R @@ -7,7 +7,7 @@ #' @export #' #' @examples -#' library('epimatch') +#' #' set.seed(9) #' x <- data.frame(dat = sample(10, replace = TRUE)) #' x$let <- letters[x$dat] diff --git a/R/matchNames.R b/R/matchNames.R index a7f1314..a57c339 100644 --- a/R/matchNames.R +++ b/R/matchNames.R @@ -8,7 +8,7 @@ #' @export #' #' @examples -#' library('epimatch') +#' #' set.seed(9) #' x <- data.frame(x = letters, y = LETTERS, z = 1:26) #' x <- x[sample(nrow(x), 10, replace = TRUE), ] diff --git a/R/process_matching.R b/R/process_matching.R index 6cb4806..520acea 100644 --- a/R/process_matching.R +++ b/R/process_matching.R @@ -20,7 +20,7 @@ #' @export #' #' @examples -#' library('epimatch') +#' #' ## Loading Data #' indata <- system.file("files", package = "epimatch") #' indata <- dir(indata, full.names = TRUE) diff --git a/R/tableFromMatch.R b/R/tableFromMatch.R index 2f95679..5d9da97 100644 --- a/R/tableFromMatch.R +++ b/R/tableFromMatch.R @@ -21,7 +21,7 @@ #' #' @examples #' ## Loading Data -#' library('epimatch') +#' #' indata <- system.file("files", package = "epimatch") #' indata <- dir(indata, full.names = TRUE) #' x <- lapply(indata, read.csv, stringsAsFactors = FALSE) diff --git a/man/ageDists.Rd b/man/ageDists.Rd index 6c9418f..a4d4e05 100644 --- a/man/ageDists.Rd +++ b/man/ageDists.Rd @@ -27,7 +27,7 @@ This function will take one or two data frames containing ages data frame with a specifier for months or year in the second column. } \examples{ -library('epimatch') + set.seed(999) x <- data.frame(age = rpois(5, 30), age_class = "YEAR") y <- data.frame(age = rpois(5, 18), age_class = "MONTH") diff --git a/man/dateDists.Rd b/man/dateDists.Rd index 6857dbc..2056cbc 100644 --- a/man/dateDists.Rd +++ b/man/dateDists.Rd @@ -27,7 +27,7 @@ The formats for dates are passed to \pkg{lubridate}'s function "ymd", etc. } \examples{ -library('epimatch') + # Two data sets in Month-Day-Year and Year-Month-Day format, respectively. # Note that the way the dates can vary within data sets, but it's assumed diff --git a/man/genderDists.Rd b/man/genderDists.Rd index 60e5c38..dfe1204 100644 --- a/man/genderDists.Rd +++ b/man/genderDists.Rd @@ -27,7 +27,7 @@ A thin wrapper around generic, which converts some known male and female codes to binary } \examples{ -library('epimatch') + test <- data.frame(c("male", "f", "m"), stringsAsFactors = FALSE) genderDists(test) } diff --git a/man/genericDists.Rd b/man/genericDists.Rd index a41b95a..46a4d27 100644 --- a/man/genericDists.Rd +++ b/man/genericDists.Rd @@ -18,7 +18,7 @@ a pairwise matrix of scores from 0 (exact match) to 1 (no match) generic function for exact matching single columns } \examples{ -library('epimatch') + set.seed(9) x <- data.frame(dat = sample(10, replace = TRUE)) x$let <- letters[x$dat] diff --git a/man/getIndexList.Rd b/man/getIndexList.Rd index 20c708f..a7ea751 100644 --- a/man/getIndexList.Rd +++ b/man/getIndexList.Rd @@ -19,7 +19,7 @@ of indices. } \examples{ ## Loading Data -library('epimatch') + indata <- system.file("files", package = "epimatch") indata <- dir(indata, full.names = TRUE) x <- lapply(indata, read.csv, stringsAsFactors = FALSE) diff --git a/man/matchEpiData.Rd b/man/matchEpiData.Rd index f3e2a4f..49dabfc 100644 --- a/man/matchEpiData.Rd +++ b/man/matchEpiData.Rd @@ -42,7 +42,7 @@ this function will take in one or two data sets, a list of functions } \examples{ ## Loading Data -library('epimatch') + indata <- system.file("files", package = "epimatch") indata <- dir(indata, full.names = TRUE) x <- lapply(indata, read.csv, stringsAsFactors = FALSE) diff --git a/man/nameDists.Rd b/man/nameDists.Rd index c79dd12..08aa219 100644 --- a/man/nameDists.Rd +++ b/man/nameDists.Rd @@ -20,7 +20,7 @@ Clean and match names column by removing punctuation and replacing punctuation with an underscore } \examples{ -library('epimatch') + set.seed(9) x <- data.frame(x = letters, y = LETTERS, z = 1:26) x <- x[sample(nrow(x), 10, replace = TRUE), ] diff --git a/man/processFunctionList.Rd b/man/processFunctionList.Rd index f213384..3c8b097 100644 --- a/man/processFunctionList.Rd +++ b/man/processFunctionList.Rd @@ -29,7 +29,7 @@ column names, and function parameters, and return a list of matrices by function. } \examples{ -library('epimatch') + ## Loading Data indata <- system.file("files", package = "epimatch") indata <- dir(indata, full.names = TRUE) diff --git a/man/tablesFromMatch.Rd b/man/tablesFromMatch.Rd index f35eca4..448b816 100644 --- a/man/tablesFromMatch.Rd +++ b/man/tablesFromMatch.Rd @@ -47,7 +47,7 @@ This will collect all of the data from \code{\link{matchEpiData}} } \examples{ ## Loading Data -library('epimatch') + indata <- system.file("files", package = "epimatch") indata <- dir(indata, full.names = TRUE) x <- lapply(indata, read.csv, stringsAsFactors = FALSE) From 453d98caf96a9736f3fe15e2557aa853db89d49f Mon Sep 17 00:00:00 2001 From: Zhian Kamvar Date: Sat, 6 Aug 2016 10:14:07 -0700 Subject: [PATCH 13/16] update Rproj specifications --- epimatch.Rproj | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/epimatch.Rproj b/epimatch.Rproj index cba1b6b..7c89849 100644 --- a/epimatch.Rproj +++ b/epimatch.Rproj @@ -17,5 +17,6 @@ StripTrailingWhitespace: Yes BuildType: Package PackageUseDevtools: Yes -PackageInstallArgs: --no-multiarch --with-keep.source +PackageInstallArgs: --no-multiarch --with-keep.source --install-tests +PackageCheckArgs: --as-cran PackageRoxygenize: rd,collate,namespace From 080b7d729464f002fae087ba62b01fbe22ad02a3 Mon Sep 17 00:00:00 2001 From: Zhian Kamvar Date: Sat, 6 Aug 2016 10:15:10 -0700 Subject: [PATCH 14/16] Add information to list of tables This addresses Dean's comment: https://github.com/Hackout3/epimatch/pull/12#issuecomment-238000614 --- R/tableFromMatch.R | 52 ++++++++++++++++++++---------------------- man/tablesFromMatch.Rd | 17 +++++++------- 2 files changed, 34 insertions(+), 35 deletions(-) diff --git a/R/tableFromMatch.R b/R/tableFromMatch.R index 5d9da97..010f0a2 100644 --- a/R/tableFromMatch.R +++ b/R/tableFromMatch.R @@ -5,23 +5,25 @@ #' \code{NULL}, a threshold can be supplied instead to calculate the list on #' the fly. #' @param collapse When \code{TRUE}, the list of data frames will be collapsed -#' into one data frame with three or four extra columns specifying: +#' into one data frame an extra column specifying the group appended. +#' +#' @details This will collect all of the data from \code{\link{matchEpiData}} +#' and present it in table format. It will collapse them into tidy tables and, +#' if provided, the score of the matches will be provided. +#' @return a list of data frames sorted in decreasing order of matching-ness +#' with three extra columns: #' \describe{ -#' \item{groups}{ match group in decreasing order of score} -#' \item{dataset}{ data set of origin} +#' \item{source}{ data set of origin} #' \item{index}{ index in data set} #' \item{score}{ the score of the matches to the first item in the group.} #' } +#' When \code{collapse = TRUE}, a fourth column, \code{groups} is appended. #' -#' @details This will collect all of the data from \code{\link{matchEpiData}} -#' and present it in table format. It will collapse them into tidy tables and, -#' if provided, the score of the matches will be provided. -#' @return a list of data frames sorted in decreasing order of matching-ness. #' @export #' #' @examples #' ## Loading Data -#' +#' #' indata <- system.file("files", package = "epimatch") #' indata <- dir(indata, full.names = TRUE) #' x <- lapply(indata, read.csv, stringsAsFactors = FALSE) @@ -73,7 +75,7 @@ tablesFromMatch <- function(dat1, dat2 = NULL, funlist = list(), # If the incoming list contains numeric values, collect the thresholds and # convert the list to indices. if (!is.integer(unlist(matchList))){ - theThresholds <- unlist(matchList) + theThresholds <- matchList matchList <- getIndexList(matchList) } @@ -93,15 +95,26 @@ tablesFromMatch <- function(dat1, dat2 = NULL, funlist = list(), names(d2) <- names(d1) } - # The output is a list of data frames subset in decreasing order of matching. + ## The output is a list of data frames subset in decreasing order of matching. outlist <- vector(length = length(matchList), mode = "list") for (i in seq(matchList)){ indices <- matchList[[i]] if (dat2exists){ - outlist[[i]] <- rbind(d1[indices$d1, ], d2[indices$d2, ]) + outdat <- rbind(d1[indices$d1, ], d2[indices$d2, ]) } else { - outlist[[i]] <- d1[indices$d1, ] + outdat <- d1[indices$d1, ] } + ## Give the indices within each data set ------------------------ + outdat$index <- unlist(indices, use.names = FALSE) + + ## Give the source of the data ---------------------------------- + outdat$source <- rep(c("d1", "d2"), lapply(indices, length)) + + ## If the incoming list are thresholds, add them as a column ---- + if (exists("theThresholds", inherits = FALSE)){ + outdat$score <- unlist(theThresholds[[i]]) + } + outlist[[i]] <- outdat } if (collapse){ @@ -113,24 +126,9 @@ tablesFromMatch <- function(dat1, dat2 = NULL, funlist = list(), groups <- lapply(lapply(matchList, unlist), length) groups <- rep(seq_along(matchList), groups) - ## Give the source of the data ---------------------------------- - data_length <- unlist(lapply(matchList, lapply, length)) - data_source <- rep(c("d1", "d2"), length(matchList)) - data_source <- rep(data_source, data_length) - - ## Give the indices within each data set ------------------------ - indices <- unlist(matchList) - ## Create the resulting tidy data frame and add information ----- outlist <- do.call("rbind", outlist) outlist$groups <- groups - outlist$dataset <- data_source - outlist$index <- indices - - ## If the incoming list are thresholds, add them as a column ---- - if (exists("theThresholds", inherits = FALSE)){ - outlist$score <- theThresholds - } } return(outlist) diff --git a/man/tablesFromMatch.Rd b/man/tablesFromMatch.Rd index 448b816..4a6ef8f 100644 --- a/man/tablesFromMatch.Rd +++ b/man/tablesFromMatch.Rd @@ -26,16 +26,17 @@ tablesFromMatch(dat1, dat2 = NULL, funlist = list(), matchList = NULL, the fly.} \item{collapse}{When \code{TRUE}, the list of data frames will be collapsed -into one data frame with three or four extra columns specifying: -\describe{ - \item{groups}{ match group in decreasing order of score} - \item{dataset}{ data set of origin} - \item{index}{ index in data set} - \item{score}{ the score of the matches to the first item in the group.} -}} +into one data frame an extra column specifying the group appended.} } \value{ -a list of data frames sorted in decreasing order of matching-ness. +a list of data frames sorted in decreasing order of matching-ness + with three extra columns: + \describe{ + \item{source}{ data set of origin} + \item{index}{ index in data set} + \item{score}{ the score of the matches to the first item in the group.} + } + When \code{collapse = TRUE}, a fourth column, \code{groups} is appended. } \description{ Generate a list of tables aggregating the matched values From 98528a50e7d407670b9f6d49ddc1a43562f19896 Mon Sep 17 00:00:00 2001 From: Zhian Kamvar Date: Sat, 6 Aug 2016 10:18:11 -0700 Subject: [PATCH 15/16] Remove rownames and rename groups to group This addresses Dean's comment: https://github.com/Hackout3/epimatch/pull/12#issuecomment-237997712 --- R/tableFromMatch.R | 6 ++++-- man/tablesFromMatch.Rd | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/R/tableFromMatch.R b/R/tableFromMatch.R index 010f0a2..243e7ba 100644 --- a/R/tableFromMatch.R +++ b/R/tableFromMatch.R @@ -17,7 +17,7 @@ #' \item{index}{ index in data set} #' \item{score}{ the score of the matches to the first item in the group.} #' } -#' When \code{collapse = TRUE}, a fourth column, \code{groups} is appended. +#' When \code{collapse = TRUE}, a fourth column, \code{group} is appended. #' #' @export #' @@ -114,6 +114,7 @@ tablesFromMatch <- function(dat1, dat2 = NULL, funlist = list(), if (exists("theThresholds", inherits = FALSE)){ outdat$score <- unlist(theThresholds[[i]]) } + rownames(outdat) <- NULL outlist[[i]] <- outdat } @@ -128,7 +129,8 @@ tablesFromMatch <- function(dat1, dat2 = NULL, funlist = list(), ## Create the resulting tidy data frame and add information ----- outlist <- do.call("rbind", outlist) - outlist$groups <- groups + outlist$group <- groups + rownames(outlist) <- NULL } return(outlist) diff --git a/man/tablesFromMatch.Rd b/man/tablesFromMatch.Rd index 4a6ef8f..eb5b582 100644 --- a/man/tablesFromMatch.Rd +++ b/man/tablesFromMatch.Rd @@ -36,7 +36,7 @@ a list of data frames sorted in decreasing order of matching-ness \item{index}{ index in data set} \item{score}{ the score of the matches to the first item in the group.} } - When \code{collapse = TRUE}, a fourth column, \code{groups} is appended. + When \code{collapse = TRUE}, a fourth column, \code{group} is appended. } \description{ Generate a list of tables aggregating the matched values From 74aa866d15813fc44524f8d3810382fa7295c890 Mon Sep 17 00:00:00 2001 From: Zhian Kamvar Date: Sat, 6 Aug 2016 10:56:57 -0700 Subject: [PATCH 16/16] Add tests for tablesFromMatch --- tests/testthat/test-export.R | 55 ++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 tests/testthat/test-export.R diff --git a/tests/testthat/test-export.R b/tests/testthat/test-export.R new file mode 100644 index 0000000..37b1b36 --- /dev/null +++ b/tests/testthat/test-export.R @@ -0,0 +1,55 @@ +context("tablesFromMatch tests") + +indata <- system.file("files", package = "epimatch") +indata <- dir(indata, full.names = TRUE) +x <- lapply(indata, read.csv, stringsAsFactors = FALSE) +names(x) <- basename(indata) + +# We will use one data set from the case information and lab results +case <- x[["CaseInformationForm.csv"]] +lab <- x[["LaboratoryResultsForm7.csv"]] + +funlist <- list( + list(d1vars = "ID", + d2vars = "ID", + fun = "nameDists", + extraparams = NULL, + weight = 1), + list(d1vars = c("Surname", "OtherNames"), + d2vars = c("SurnameLab", "OtherNameLab"), + fun = "nameDists", + extraparams = NULL, + weight = 0.5) + ) +res <- matchEpiData(dat1 = case, + dat2 = lab, + funlist = funlist, + thresh = 0.25, + giveWeight = TRUE) +l_res <- tablesFromMatch(case, lab, funlist, matchList = res, collapse = FALSE) +t_res <- tablesFromMatch(case, lab, funlist, matchList = 0.25) + +## This not only tests the conversion, but since the input for the tests above +## are the same, this also tests to ensure that the matchList argument can take +## the result of matchEpiData AND a threshold. +test_that("list output and collapsed output are the same", { + expect_equal(sum(vapply(l_res, nrow, 1L)), nrow(t_res)) + expect_equal(do.call("rbind", l_res), t_res[names(l_res[[1]])]) +}) + +test_that("tablesFromMatch returns a list when collapse = FALSE", { + expect_is(l_res, "list") + for (i in seq_along(l_res)){ + dat <- l_res[[i]] + expect_is(dat, "data.frame", info = paste("group", i)) + expect_equal(ncol(dat), ncol(l_res[[i]])) + } +}) + +test_that("the data frame is tidy", { + expect_is(t_res, "data.frame") + expect_identical(names(t_res)[length(t_res)], "group") + expect_equal(length(unique(t_res$group)), length(l_res)) + expect_equal(nrow(t_res), 20L) + expect_equal(ncol(t_res), 6L) +})