Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,14 @@ export(dateDists)
export(distFuns)
export(genderDists)
export(genericDists)
export(getIndexList)
export(launch)
export(locationDists)
export(matchEpiData)
export(nameDists)
export(processFunctionList)
export(returnMatches)
export(tablesFromMatch)
importFrom(lubridate,parse_date_time)
importFrom(shiny,runApp)
importFrom(stats,dist)
Expand Down
89 changes: 68 additions & 21 deletions R/epi_self_match.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#'
#' @examples
#' ## Loading Data
#'
#' indata <- system.file("files", package = "epimatch")
#' indata <- dir(indata, full.names = TRUE)
#' x <- lapply(indata, read.csv, stringsAsFactors = FALSE)
Expand All @@ -25,32 +26,29 @@
#' case <- x[["CaseInformationForm.csv"]]
#' lab <- x[["LaboratoryResultsForm7.csv"]]
#'
#' funlist <- list(
#' list(d1vars = "ID",
#' d2vars = "ID",
#' fun = "nameDists",
#' extraparams = NULL,
#' weight = 1),
#' list(d1vars = c("Surname", "OtherNames"),
#' d2vars = c("SurnameLab", "OtherNameLab"),
#' fun = "nameDists",
#' extraparams = NULL,
#' weight = 0.5)
#' )
#' # This will get all of the indices that match the ID and Names with a
#' # threshold of 0.25
#' res <- matchEpiData(dat1 = case,
#' dat2 = lab,
#' funlist = list(
#' list(d1vars = "ID",
#' d2vars = "ID",
#' fun = "nameDists",
#' extraparams = NULL,
#' weight = 1),
#' list(d1vars = c("Surname", "OtherNames"),
#' d2vars = c("SurnameLab", "OtherNameLab"),
#' fun = "nameDists",
#' extraparams = NULL,
#' weight = 0.5)
#' ),
#' thresh = 0.25)
#' funlist = funlist,
#' thresh = 0.25,
#' giveWeight = FALSE)
#' # List of indices
#' res
#'
#' # Printing out the matching names in decreasing order of matching
#' invisible(lapply(res, function(i) {
#' print(case[i$d1, c("Surname", "OtherNames")])
#' print(lab[i$d2, c("SurnameLab", "OtherNameLab")])
#' cat("\n\t--------\n")
#' }))
#' tablesFromMatch(case, lab, funlist, matchList = res)
matchEpiData <- function(dat1, dat2 = NULL, funlist = list(), thresh = 0.05, giveWeight = FALSE){
the_matrices <- processFunctionList(dat1, dat2, funlist)
the_weights <- unlist(lapply(funlist, function(i) i$weight))
Expand All @@ -59,8 +57,57 @@ matchEpiData <- function(dat1, dat2 = NULL, funlist = list(), thresh = 0.05, giv
# For processing in the shiny app, it's useful to display either the weights
# as a named index vector or the indices themselves
if (!giveWeight){
names(out) <- NULL
out <- lapply(out, lapply, function(i) as.integer(names(i)))
out <- getIndexList(out)
}
return(out)
}

#' return a list of indices from matchEpiData
#'
#' The output of matchEpiData can either be a list of indices or a named list of
#' weights. If it's a named list of weights, this function will return the list
#' of indices.
#'
#' @param matchList a list of lists of weights
#'
#' @return an unnamed list of lists of indices
#' @keywords internal utilities
#' @export
#' @examples
#' ## Loading Data
#'
#' indata <- system.file("files", package = "epimatch")
#' indata <- dir(indata, full.names = TRUE)
#' x <- lapply(indata, read.csv, stringsAsFactors = FALSE)
#' names(x) <- basename(indata)
#'
#' # We will use one data set from the case information and lab results
#' case <- x[["CaseInformationForm.csv"]]
#' lab <- x[["LaboratoryResultsForm7.csv"]]
#'
#' funlist <- list(
#' list(d1vars = "ID",
#' d2vars = "ID",
#' fun = "nameDists",
#' extraparams = NULL,
#' weight = 1),
#' list(d1vars = c("Surname", "OtherNames"),
#' d2vars = c("SurnameLab", "OtherNameLab"),
#' fun = "nameDists",
#' extraparams = NULL,
#' weight = 0.5)
#' )
#' # This will get all of the indices that match the ID and Names with a
#' # threshold of 0.25
#' res <- matchEpiData(dat1 = case,
#' dat2 = lab,
#' funlist = funlist,
#' thresh = 0.25,
#' giveWeight = TRUE)
#' res # List of weights
#' getIndexList(res) # List of indices
getIndexList <- function(matchList){
out <- lapply(matchList, lapply, function(i) as.integer(names(i)))
names(out) <- NULL
return(out)
}
1 change: 1 addition & 0 deletions R/epimatch-package.r
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#' \itemize{
#' \item \code{\link{processFunctionList}}
#' \item \code{\link{matchEpiData}}
#' \item \code{\link{tablesFromMatch}}
#' }
#'
#' @section Dissimilarity Functions: Each dissimilarity function returns a
Expand Down
24 changes: 24 additions & 0 deletions R/internal.R
Original file line number Diff line number Diff line change
Expand Up @@ -66,3 +66,27 @@ splitComma <- function(dat)
}
return(new_dat)
}

#' Collapse columns of a data frame into one
#'
#' @param df a data frame with one or more columns
#' @param sep a separator to collapse the columns with
#'
#' @return a single column data frame
#' @keywords internal
#' @noRd
#'
#' @examples
#' dat <- data.frame(a = letters, b = sample(100, 26), c = LETTERS)
#' collapseValues(dat)
collapseValues <- function(df, sep = "_"){
if (!is.data.frame(df)){
stop("a data frame is needed")
}
if (length(df) > 1){
dfnames <- paste(names(df), collapse = sep)
df <- data.frame(apply(df, 1, paste, collapse = sep), stringsAsFactors = FALSE)
names(df) <- dfnames
}
return(df)
}
1 change: 1 addition & 0 deletions R/matchAge.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#' @export
#'
#' @examples
#'
#' set.seed(999)
#' x <- data.frame(age = rpois(5, 30), age_class = "YEAR")
#' y <- data.frame(age = rpois(5, 18), age_class = "MONTH")
Expand Down
1 change: 1 addition & 0 deletions R/matchDate.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#' @importFrom lubridate parse_date_time
#'
#' @examples
#'
#'
#' # Two data sets in Month-Day-Year and Year-Month-Day format, respectively.
#' # Note that the way the dates can vary within data sets, but it's assumed
Expand Down
1 change: 1 addition & 0 deletions R/matchGender.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#' @export
#'
#' @examples
#'
#' test <- data.frame(c("male", "f", "m"), stringsAsFactors = FALSE)
#' genderDists(test)
genderDists <- function(dat1, dat2=NULL,
Expand Down
1 change: 1 addition & 0 deletions R/matchGeneric.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#' @export
#'
#' @examples
#'
#' set.seed(9)
#' x <- data.frame(dat = sample(10, replace = TRUE))
#' x$let <- letters[x$dat]
Expand Down
1 change: 1 addition & 0 deletions R/matchNames.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#' @export
#'
#' @examples
#'
#' set.seed(9)
#' x <- data.frame(x = letters, y = LETTERS, z = 1:26)
#' x <- x[sample(nrow(x), 10, replace = TRUE), ]
Expand Down
1 change: 1 addition & 0 deletions R/process_matching.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#' @export
#'
#' @examples
#'
#' ## Loading Data
#' indata <- system.file("files", package = "epimatch")
#' indata <- dir(indata, full.names = TRUE)
Expand Down
137 changes: 137 additions & 0 deletions R/tableFromMatch.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
#' Generate a list of tables aggregating the matched values
#'
#' @inheritParams processFunctionList
#' @param matchList a list derived from \code{\link{matchEpiData}}. If this is
#' \code{NULL}, a threshold can be supplied instead to calculate the list on
#' the fly.
#' @param collapse When \code{TRUE}, the list of data frames will be collapsed
#' into one data frame an extra column specifying the group appended.
#'
#' @details This will collect all of the data from \code{\link{matchEpiData}}
#' and present it in table format. It will collapse them into tidy tables and,
#' if provided, the score of the matches will be provided.
#' @return a list of data frames sorted in decreasing order of matching-ness
#' with three extra columns:
#' \describe{
#' \item{source}{ data set of origin}
#' \item{index}{ index in data set}
#' \item{score}{ the score of the matches to the first item in the group.}
#' }
#' When \code{collapse = TRUE}, a fourth column, \code{group} is appended.
#'
#' @export
#'
#' @examples
#' ## Loading Data
#'
#' indata <- system.file("files", package = "epimatch")
#' indata <- dir(indata, full.names = TRUE)
#' x <- lapply(indata, read.csv, stringsAsFactors = FALSE)
#' names(x) <- basename(indata)
#'
#' # We will use one data set from the case information and lab results
#' case <- x[["CaseInformationForm.csv"]]
#' lab <- x[["LaboratoryResultsForm7.csv"]]
#'
#' funlist <- list(
#' list(d1vars = "ID",
#' d2vars = "ID",
#' fun = "nameDists",
#' extraparams = NULL,
#' weight = 1),
#' list(d1vars = c("Surname", "OtherNames"),
#' d2vars = c("SurnameLab", "OtherNameLab"),
#' fun = "nameDists",
#' extraparams = NULL,
#' weight = 0.5)
#' )
#' # This will get all of the indices that match the ID and Names with a
#' # threshold of 0.25
#' res <- matchEpiData(dat1 = case,
#' dat2 = lab,
#' funlist = funlist,
#' thresh = 0.25,
#' giveWeight = TRUE)
#' tablesFromMatch(case, lab, funlist, matchList = res, collapse = FALSE)
#' tablesFromMatch(case, lab, funlist, matchList = 0.25)
tablesFromMatch <- function(dat1, dat2 = NULL, funlist = list(),
matchList = NULL, collapse = TRUE){

# Check if incoming data are cromulent
dat2exists <- !is.null(dat2) && is.data.frame(dat2)
if (!is.data.frame(dat1) | !dat2exists){
stop("data must be in data frame format")
}
# Check if there are any matches at all
if (is.null(matchList) || length(matchList) == 0){
stop("no matches found!")
}
# If the matchList is a threshold parameter, pass it to the matchEpiData
# function.
if (is.numeric(matchList) && length(matchList) == 1){
matchList <- matchEpiData(dat1, dat2, funlist = funlist, thresh = matchList,
giveWeight = TRUE)
}
# If the incoming list contains numeric values, collect the thresholds and
# convert the list to indices.
if (!is.integer(unlist(matchList))){
theThresholds <- matchList
matchList <- getIndexList(matchList)
}

# Grab the tested variables from dat1
d1vars <- lapply(funlist, "[[", "d1vars")
d1 <- lapply(d1vars, function(i) dat1[i])
# Since multiple columns can be evaluated by a single function, we are going
# to collapse these columns into one.
d1 <- data.frame(lapply(d1, collapseValues))

if (dat2exists){
# Grab the tested variables from dat2 and rename them based off of dat1.
# This allows us to bind the data together a lot easier.
d2vars <- lapply(funlist, "[[", "d2vars")
d2 <- lapply(d2vars, function(i) dat2[i])
d2 <- data.frame(lapply(d2, collapseValues))
names(d2) <- names(d1)
}

## The output is a list of data frames subset in decreasing order of matching.
outlist <- vector(length = length(matchList), mode = "list")
for (i in seq(matchList)){
indices <- matchList[[i]]
if (dat2exists){
outdat <- rbind(d1[indices$d1, ], d2[indices$d2, ])
} else {
outdat <- d1[indices$d1, ]
}
## Give the indices within each data set ------------------------
outdat$index <- unlist(indices, use.names = FALSE)

## Give the source of the data ----------------------------------
outdat$source <- rep(c("d1", "d2"), lapply(indices, length))

## If the incoming list are thresholds, add them as a column ----
if (exists("theThresholds", inherits = FALSE)){
outdat$score <- unlist(theThresholds[[i]])
}
rownames(outdat) <- NULL
outlist[[i]] <- outdat
}

if (collapse){
# This is where we make the data tidy by placing everything in a single data
# frame and adding three extra columns that contain the information that was
# in the result from `matchEpiData()`.

## Give the indices of the highest list -------------------------
groups <- lapply(lapply(matchList, unlist), length)
groups <- rep(seq_along(matchList), groups)

## Create the resulting tidy data frame and add information -----
outlist <- do.call("rbind", outlist)
outlist$group <- groups
rownames(outlist) <- NULL
}

return(outlist)
}
3 changes: 2 additions & 1 deletion epimatch.Rproj
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,6 @@ StripTrailingWhitespace: Yes

BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
PackageInstallArgs: --no-multiarch --with-keep.source --install-tests
PackageCheckArgs: --as-cran
PackageRoxygenize: rd,collate,namespace
6 changes: 4 additions & 2 deletions inst/shiny/server.R
Original file line number Diff line number Diff line change
Expand Up @@ -251,11 +251,13 @@ function(input, output, session) {
hide("findMatchesLoading")
})
tryCatch({
values$results <- epimatch::matchEpiData(
theMatches <- epimatch::matchEpiData(
dat1 = values$data1, dat2 = values$data2,
funlist = funlist,
thresh = input$threshold
thresh = input$threshold,
giveWeight = TRUE
)
values$results <- epimatch::getIndexList(theMatches)
show(selector = ".findMatchesDone")
delay(1000, hide(selector = ".findMatchesDone", anim = TRUE, animType = "fade",
time = 0.5))
Expand Down
1 change: 1 addition & 0 deletions man/ageDists.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions man/dateDists.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading