--
You received this message because you are subscribed to the ggplot2 mailing list.
Please provide a reproducible example: http://gist.github.com/270442
To post: email ggp...@googlegroups.com
To unsubscribe: email ggplot2+u...@googlegroups.com
More options: http://groups.google.com/group/ggplot2
iris_mclust=Mclust(iris[,1:4])
get.ellipses <- function(coords, mclust.fit){
centers <- mclust.fit$parameters$mean[coords, ]
vars <- mclust.fit$parameters$variance$sigma[coords, coords, ]
ldply(1:ncol(centers), function(cluster){
data.frame(ellipse(vars[,,cluster], centre = centers[, cluster],
level = 0.5), classification = cluster)
})
}
iris.el <- get.ellipses(c("Sepal.Length", "Sepal.Width"), iris_mclust)
iris$classification <- iris_mclust$classification
ggplot(iris, aes(Sepal.Length, Sepal.Width, colour = factor(classification))) +
geom_point(aes(shape = classification))+
geom_path(data = iris.el,
aes(group = classification, linetype = classification))
The size of the ellipses is the tricky part, from what I can tell
mclust uses some classification probability level. I just used the 50%
quantile for an equivalent multivariate normal (the level parameter in
ellipse).
Hope it helps,
Charlotte
require(mclust)
require(ggplot2)
require(ellipse)
#if function, input is original dataset
orig.data<-iris[1:4]
data.mclust<-Mclust(orig.data)
BIC.data<-as.data.frame(data.mclust$BIC)
BIC.data$NumComp<-rownames(BIC.data)
melted.BIC<-melt(BIC.data, var.ids= "NumComp")
ggplot(melted.BIC, aes(x=as.numeric(NumComp), y=value, colour=variable, group=variable))+
scale_x_continuous("Number of Components")+
scale_y_continuous("BIC")+
scale_colour_hue("")+
geom_point()+
geom_line()+
theme_bw()
#plotting correlation matrix of original data colored by model classification
#stripped from plotmatrix()
mapping=aes(colour=as.factor(data.mclust$classification), shape=as.factor(data.mclust$classification))
grid <- expand.grid(x = 1:ncol(orig.data), y = 1:ncol(orig.data))
grid <- subset(grid, x != y)
all <- do.call("rbind", lapply(1:nrow(grid), function(i) {
xcol <- grid[i, "x"]
ycol <- grid[i, "y"]
data.frame(xvar = names(orig.data)[ycol], yvar = names(orig.data)[xcol],
x = orig.data[, xcol], y = orig.data[, ycol], orig.data)
}))
all$xvar <- factor(all$xvar, levels = names(orig.data))
all$yvar <- factor(all$yvar, levels = names(orig.data))
densities <- do.call("rbind", lapply(1:ncol(orig.data), function(i) {
data.frame(xvar = names(orig.data)[i], yvar = names(orig.data)[i],
x = orig.data[, i])
}))
mapping <- defaults(mapping, aes_string(x = "x", y = "y"))
class(mapping) <- "uneval"
ggplot(all, mapping) +
facet_grid(xvar ~ yvar, scales = "free") +
geom_point(na.rm = TRUE) +
stat_density(aes(x = x, y = ..scaled.. * diff(range(x)) + min(x)), data = densities, position = "identity", colour = "grey20", geom = "line") +
opts(legend.postion= "none")
#cant get rid of the damn legend
#it would be a bunch faster if it only plotted 1 v 2 and not also 2 v 1... cest la vie
#scatter plot of width by length
#color and shape by classification
#center of cluster by data.mclust$parameters$mean
get.ellipses <- function(coords, mclust.fit){
centers <- mclust.fit$parameters$mean[coords, ]
vars <- mclust.fit$parameters$variance$sigma[coords, coords, ]
ldply(1:ncol(centers), function(cluster){
data.frame(ellipse(vars[,,cluster], centre = centers[, cluster],
level = 0.5), classification = cluster)
})
}
iris.el <- get.ellipses(c("Sepal.Length", "Sepal.Width"), data.mclust)
orig.data$classification <- data.mclust$classification
ggplot(orig.data, aes(Sepal.Length, Sepal.Width, colour = factor(classification))) +
geom_point(aes(shape = classification))+
geom_path(data = iris.el,
aes(group = classification, linetype = classification))
Getting there...
Brandon
Dennis,That's very slick. I was wondering how to go about using Charlotte's code to run the ellipse code on the other pairs, but it makes perfect sense to put them in the scatterplot matrix. I'm wondering about a few things.
- The density plots now represent the two different clusters in the iris dataset, does this make more sense than having a representation of the entire distribution?
- It'd be nice to eliminate the grid object either in the top right or bottom left half of the matrix, less plotting = less CPU time and you're not really getting more information with graphs 'A' vs 'B' and 'B' vs 'A', can you put empty grid objects in a pattern like that? I've seen it done with geom_tile() for a similar correlation matrix, but that's with one plot, not multiple ones like in this instance.
- Lastly, there's one more plot in the output of plot(Mclust()) ... "uncertainty"... which appears to be done by using scale_size and perhaps scale_alpha on geom_point() in the scatterplot. However, I'm really 'uncertain' as to where the levels for this scaling comes from.