# clean.r # T. J. Finney, 2014-01-11. # This incorporates code suggested by Bill Venables. message("Clean a data matrix by dropping deficient cases...") message("If a reference case is specified then it is preserved if possible.") message("A deficient case is one which causes any possible pair of cases to have less than a required number of places where both are defined (i.e. not NA).") # Reset. rm(list=ls(all=TRUE)) # Set parameters. message("Input data matrix: ", file.name <- "../data/Mark-UBS4.csv") message("Output directory: ", output.path <- "../data/clean/") message("Minimum required number of pairwise defined places: ", req <- 15) message("Reference case: ", ref <- "it-k") # Set writing mode: [1] = write results; [2] = do not write results. message("Write mode: ", write <- c(TRUE, FALSE)[1]) # Read input (assumed to be nominal data). input.fr <- read.csv(file.name, row.names=1, colClasses="factor") # Make pairwise defined counts matrix. input.mx <- data.matrix(input.fr) input.mx[] <- !is.na(input.mx) counts.mx <- tcrossprod(input.mx) # Stage 1: message("Stage 1: drop objects without enough defined states:") drop <- (diag(counts.mx) < req) message(paste(rownames(input.fr)[drop], collapse=" ")) st1.fr <- input.fr[!drop,] counts.mx <- counts.mx[!drop, !drop] # Check ref. object. if (ref != "") { if (!(ref %in% rownames(input.fr))) stop("Input data matrix lacks reference object.", call.=FALSE) else if (!(ref %in% rownames(st1.fr))) stop("Reference object deficient.", call.=FALSE) } # Stage 2: message("Stage 2: drop objects without enough pairwise defined states:") while((m <- min(counts.mx)) < req) { # List objects with least counts. ls <- which(counts.mx == m, arr.ind=TRUE)[,1] # Choose worst defined object which is not ref. wd <- sort(diag(counts.mx)[ls[names(ls) != ref]])[1] i <- which(names(ls) == names(wd))[1] # Drop it. counts.mx <- counts.mx[-ls[i], -ls[i]] #cat("drop", names(ls[i]), "with min. of", m, "\n") } drop <- !(rownames(st1.fr) %in% rownames(counts.mx)) message(paste(rownames(st1.fr)[drop], collapse=" ")) st2.fr <- st1.fr[!drop,] # Write results. parts <- c( sub(".csv", "", sub(".*/", "", file.name)), if (ref == "") NULL else ref, as.character(req) ) file.name <- paste(c(output.path, paste(parts, collapse="."), ".csv"), collapse="") if (write) { message("Write ", file.name) write.csv(st2.fr, file.name) } else { message("Would write ", file.name) }