# MAR-survey.r # T. J. Finney, 2011-10-20. message("Produce MDS and DC results from data compiled by Maurice Robinson.") # Set parameters. message("Input directory: ", dir_in <- "../data/") message("Input table (percentage agreements): ", tab_in <- "Matt-UBS2.csv") message("Input counts (numbers of cards): ", counts_in <- "Matt-UBS2-counts.csv") message("Output directory for distance matrix and counts list: ", dir_dist <- "../dist/") message("Output directory for MDS result: ", dir_MDS <- "../cmds/") message("Output directory for DC result: ", dir_DC <- "../dc/") message("Minimum tolerable number of shared passages: ", tolerable <- 15) # Set a reference witness to force its inclusion. (Use empty string "" if none required.) message("Reference witness: ", ref <- "A") # Set writing mode: TRUE = write results; FALSE = do not write results. message("Write mode: ", write <- c(TRUE, FALSE)[1]) # Functions # drop_missing_sq(frame, col_name) # Drop columns with missing data (NAs) from a square data frame. # * col_name specifies a column to keep. # * This reduces the data frame by iteratively dropping one column and # corresponding row at a time. # * On every iteration the two columns with the highest number of NAs are # identified. # * If the first is the column to keep then the other is dropped. # # frame: A square data frame. # col_name: A column name to keep. # Return: A square data frame with no missing data. drop_missing_sq <- function(frame, col_name) { R <- dim(frame)[1] C <- dim(frame)[2] stopifnot(R==C) if (col_name != "") stopifnot(sum(is.na(frame[[col_name]])) < C) NAs <- vector(length=C) for (c in 1:C) { NAs[c] <- sum(is.na(frame[c])) } sorted <- sort(NAs, decreasing=TRUE) # If no NAs then return frame; otherwise drop col and row then recurse. if (sorted[1] == 0) frame else { v1 <- which(NAs==sorted[1]) v2 <- which(NAs==sorted[2]) first <- v1[1] second <- if (length(v1) > 1) v1[2] else v2[1] drop <- if (rownames(frame)[first] == col_name) second else first message(rownames(frame)[drop]) keep <- rep(TRUE, times=C) keep[drop] <- FALSE drop_missing_sq(frame[keep, keep], col_name) } } # Script # Read input. dist_in <- paste(c(dir_in, tab_in), collapse="") counts_in <- paste(c(dir_in, counts_in), collapse="") # "0" (missing data) -> NA dist <- read.csv(dist_in, row.names=1, na.strings="0", check.names=FALSE) counts <- read.csv(counts_in, row.names=1, check.names=FALSE) # Remove superfluous rows from counts. counts <- subset(counts, row.names(counts) %in% row.names(dist)) # Do conversion: # * "X" (100% agreement) -> 100. # * fill upper triangle. dist <- as.matrix(dist) R <- dim(dist)[1] for (r in 1:R) { dist[r, r:R] <- dist[r:R, r] dist[r, r] <- 100 } dist <- as.data.frame(dist) # Drop witnesses without enough passages. keep <- counts[,1] >= tolerable drop <- (1:R)[!keep] keep <- (1:R)[keep] message("Drop witnesses without enough passages:") message(paste(row.names(dist)[drop], collapse=" ")) dist <- dist[keep, keep] # Drop witnesses which still have missing data. message("Drop witnesses which still have missing data:") dist <- drop_missing_sq(dist, ref) # Calculate average number of variation units. counts <- subset(counts, row.names(counts) %in% row.names(dist)) message("Mean number of variation units (rounded): ", floor(mean(counts[,1]))) # Convert to distance object. R <- dim(dist)[1] names <- row.names(dist) dist <- matrix(as.numeric(as.matrix(dist)), nrow=R) rownames(dist) <- colnames(dist) <- names distances <- as.dist((100 - dist) / 100) message("Mean distance: ", round(mean(distances), digits = 3)) # Shut down open graphics devices. graphics.off() # Perform MDS analysis. require(cluster) MDS <- cmdscale(distances, k=3, eig=TRUE) x <- MDS$points[,1] y <- MDS$points[,2] z <- MDS$points[,3] require(rgl) plot3d(x, y, z, xlab="axis 1", ylab="axis 2", zlab="axis 3", type='n', axes=TRUE, box=TRUE) text3d(x, y, z, rownames(MDS$points), col=4) message("Proportion of variance: ", round(MDS$GOF[1], digits = 2)) # Perform DC analysis. require(cluster) DC <- diana(distances) message("Divisive coefficient: ", round(DC$dc, digits = 2)) plot(DC, which=2, main="", cex=0.8) # Write results. if (write) { parts <- c( sub(".csv", "", tab_in), if (ref == "") NULL else ref, as.character(tolerable) ) name <- paste(parts, collapse=".") out_dist <- paste(c(dir_dist, name, ".SMD.csv"), collapse="") out_counts <- paste(c(dir_dist, name, ".counts.csv"), collapse="") out_DC <- paste(c(dir_DC, name, ".png"), collapse="") write.csv(round(as.matrix(distances), digits=3), out_dist) write.csv(as.matrix(counts), out_counts) #movie3d(spin3d(rpm=10), duration=6, dir=dir_MDS, movie=name) #dev.print(png, file=out_DC, width=975, height=600) } # Clean up. #rm(list=ls(all=TRUE))