# clean.R # T. J. Finney, 2014-01-11. # This incorporates code suggested by Bill Venables. message("Clean a data matrix by dropping deficient objects...") message("If a reference object is specified then it is preserved if possible.") message("A deficient object is one which causes any possible pair of objects to have") message("less than a required number of places where both are defined (i.e. not NA).") # Reset. rm(list=ls(all=TRUE)) # Set parameters. message("Input file: ", infile <- "../data/Mark-UBS4-A.csv") message("Output file: ", outfile <- "../data/Mark-UBS4.csv") message("Minimum required number of pairwise defined places: ", req <- 15) message("Reference object: ", ref <- "") # Set writing mode: [1] = write results; [2] = do not write results. message("Write mode: ", write <- c(TRUE, FALSE)[1]) # Read input (assumed to be nominal data). input.fr <- read.csv(infile, row.names=1, colClasses="factor") # Make pairwise defined counts matrix. input.mx <- data.matrix(input.fr) input.mx[] <- !is.na(input.mx) counts.mx <- tcrossprod(input.mx) # Stage 1: message("Stage 1: drop objects without enough defined states:") drop <- (diag(counts.mx) < req) message(paste(rownames(input.fr)[drop], collapse=" ")) st1.fr <- input.fr[!drop,] counts.mx <- counts.mx[!drop, !drop] # Check ref. object. if (ref != "") { if (!(ref %in% rownames(input.fr))) stop("Input data matrix lacks reference object.", call.=FALSE) else if (!(ref %in% rownames(st1.fr))) stop("Reference object deficient.", call.=FALSE) } # Stage 2: message("Stage 2: drop objects without enough pairwise defined states:") while((m <- min(counts.mx)) < req) { # List objects with least counts. ls <- which(counts.mx == m, arr.ind=TRUE)[,1] # Choose worst defined object which is not ref. wd <- sort(diag(counts.mx)[ls[names(ls) != ref]])[1] i <- which(names(ls) == names(wd))[1] # Drop it. counts.mx <- counts.mx[-ls[i], -ls[i]] #cat("drop", names(ls[i]), "with min. of", m, "\n") } drop <- !(rownames(st1.fr) %in% rownames(counts.mx)) message(paste(rownames(st1.fr)[drop], collapse=" ")) st2.fr <- st1.fr[!drop,] # Write results. if (write) { message("Write ", outfile) write.csv(st2.fr, outfile) } else { message("Would write ", outfile) }