Plotting Histograms and a Logistic Curve on the Same Plot

461 views
Skip to first unread message

Andrew Landgraf

unread,
Jun 17, 2013, 9:35:16 AM6/17/13
to ggp...@googlegroups.com
The function loghistplot at http://schamberlain.github.io/2012/01/logistic-regression-barplot-fig/ is supposed to plot a logistic regression curve and, instead of plotting points, it plots a histogram for the 0's and a histogram for the 1's. It looks like he created 3 plots and somehow superimposed them on top of each other using viewports. I recently tried running it again and it was not working. Does anyone know what is wrong or a different way to achieve the same plot? I do not know anything about the grid package. Thanks.

Here is the code with more up to date ggplot2 syntax.

loghistplot <- function(data) {
 
  require(ggplot2); require(gridExtra) # load packages
 
  names(data) <- c('x','y') # rename columns
 
  # get min and max axis values
  min_x <- min(data$x)
  max_x <- max(data$x)
  min_y <- min(data$y)
  max_y <- max(data$y)
 
  # get bin numbers
  bin_no <- max(hist(data$x)$counts) + 5
 
  # create plots
  a <- ggplot(data, aes(x = x, y = y)) +
    theme_bw(base_size=16) +
    geom_smooth(method = "glm", family = "binomial", se = TRUE,
                colour='black', size=1.5, alpha = 0.3) +
    # scale_y_continuous(limits=c(0,1), breaks=c(0,1)) +
    scale_x_continuous(limits=c(min_x,max_x)) +
    theme(panel.grid.major = element_blank(),
         panel.grid.minor=element_blank(),
         panel.background = element_blank()) +
    labs(y = "Probability\n", x = "\nYour X Variable")
 
  b <- ggplot(data[data$y == unique(data$y)[1], ], aes(x = x)) +
    theme_bw(base_size=16) +
    geom_histogram(fill = "grey") +
    scale_y_continuous(limits=c(0,bin_no)) +
    scale_x_continuous(limits=c(min_x,max_x)) +
    theme(panel.grid.major = element_blank(),
         panel.grid.minor=element_blank(),
         axis.text.y = element_blank(),
         axis.text.x = element_blank(),
         axis.ticks = element_blank(),
         panel.border = element_blank(),
         panel.background = element_blank()) +
    labs(y='\n', x='\n')
 
  c <- ggplot(data[data$y == unique(data$y)[2], ], aes(x = x)) +
    theme_bw(base_size=16) +
    geom_histogram(fill = "grey") +
    scale_y_continuous(trans='reverse', limits=c(bin_no,0)) +
    scale_x_continuous(limits=c(min_x,max_x)) +
    theme(panel.grid.major = element_blank(),panel.grid.minor=element_blank(),
         axis.text.y = element_blank(), axis.text.x = element_blank(),
         axis.ticks = element_blank(),
         panel.border = element_blank(),
         panel.background = element_blank()) +
    labs(y='\n', x='\n')
 
  grid.newpage()
  pushViewport(viewport(layout = grid.layout(1,1)))
 
  vpa_ <- viewport(width = 1, height = 1, x = 0.5, y = 0.5)
  vpb_ <- viewport(width = 1, height = 1, x = 0.5, y = 0.5)
  vpc_ <- viewport(width = 1, height = 1, x = 0.5, y = 0.5)
 
  print(b, vp = vpb_)
  print(c, vp = vpc_)
  print(a, vp = vpa_)
}
loghistplot(mtcars[,c("mpg","vs")])

Andrew Landgraf

unread,
Jun 17, 2013, 10:20:46 AM6/17/13
to ggp...@googlegroups.com
I guess I should also say that I am using 64 bit R 3.0.0 on Windows 7 with RStudio 0.97.336. I also ran it without RStudio and on 32 bit R and it had the same issue.

Scott Chamberlain

unread,
Jun 17, 2013, 11:30:18 AM6/17/13
to ggp...@googlegroups.com
I'll take a look and get back to you soon. 

Scott

Andrew Landgraf

unread,
Jun 17, 2013, 1:33:50 PM6/17/13
to ggp...@googlegroups.com
It seems related to this problem. I tried making the panel.background = element_rect(fill = NA) and it still exhibited the same behavior.

BTW, I forgot to mention the most important part. I am using  ggplot2_0.9.3.1.

Scott Chamberlain

unread,
Jun 17, 2013, 5:44:54 PM6/17/13
to ggp...@googlegroups.com
Sorry about that. It is fixed below, or a gist here:  https://gist.github.com/SChamberlain/5800746

#' Make combined histogram/line plot for visualizing logistic regression.
#' 
#' @import ggplot2 gridExtra grid
#' @param data
#' @examples
#' loghistplot(mtcars[,c("mpg","vs")])
#' loghistplot(movies[,c("rating","Action")])
loghistplot <- function(data)
{    
  names(data) <- c('x','y') # rename columns
  
  # get min and max axis values
  min_x <- min(data$x)
  max_x <- max(data$x)
  min_y <- min(data$y)
  max_y <- max(data$y)
  
  # get bin numbers
  bin_no <- max(hist(data$x)$counts) + 5
  
  # create plots
  a <- ggplot(data, aes(x = x, y = y)) +
    theme_bw(base_size=16) +
    geom_smooth(method = "glm", family = "binomial", se = TRUE,
                colour='black', size=1.5, alpha = 0.3) +
    # scale_y_continuous(limits=c(0,1), breaks=c(0,1)) +
    scale_x_continuous(limits=c(min_x,max_x)) +
    theme(panel.grid.major = element_blank(),
          panel.grid.minor=element_blank(),
          plot.background = element_blank(),
          panel.background = element_rect(fill =  NA), 
          panel.border = element_rect(fill=NA)) +
    labs(y = "Probability\n", x = "\nYour X Variable") 
  
  b <- ggplot(data[data$y == unique(data$y)[1], ], aes(x = x)) +
    theme_bw(base_size=16) +
    geom_histogram(fill = "grey") +
    scale_y_continuous(limits=c(0,bin_no)) +
    scale_x_continuous(limits=c(min_x,max_x)) +
    theme(panel.grid.major = element_blank(),panel.grid.minor=element_blank(),
          axis.text.y = element_blank(), axis.text.x = element_blank(),
          axis.ticks = element_blank(),
          panel.border = element_blank(),
          plot.background = element_blank(),
          panel.background = element_rect(fill =  NA), 
          panel.border = element_rect(fill=NA)) +
    labs(y='\n', x='\n')
  
  c <- ggplot(data[data$y == unique(data$y)[2], ], aes(x = x)) +
    theme_bw(base_size=16) +
    geom_histogram(fill = "grey") +
    scale_y_continuous(trans='reverse', limits=c(bin_no,0)) +
    scale_x_continuous(limits=c(min_x,max_x)) +
    theme(panel.grid.major = element_blank(),panel.grid.minor=element_blank(),
          axis.text.y = element_blank(), axis.text.x = element_blank(),
          axis.ticks = element_blank(),
          panel.border = element_blank(),
          plot.background = element_blank(),
          panel.background = element_rect(fill =  NA), 
          panel.border = element_rect(fill=NA)) +
    labs(y='\n', x='\n')
  
  grid.newpage()
  pushViewport(viewport(layout = grid.layout(1,1)))
  
  vpa_ <- viewport(width = 1, height = 1, x = 0.5, y = 0.5)
  vpb_ <- viewport(width = 1, height = 1, x = 0.5, y = 0.5)
  vpc_ <- viewport(width = 1, height = 1, x = 0.5, y = 0.5)

  # these three used to lay on top of one another, no 
  print(b, vp = vpc_)
  print(c, vp = vpa_)
  print(a, vp = vpb_) 

Andrew Landgraf

unread,
Jun 18, 2013, 11:21:40 AM6/18/13
to ggp...@googlegroups.com
Thanks Scott.

I actually attempted to build the same type of plot a different way. I plot a logistic regression curve, and then I manually add histograms using geom_bar. So this is all one ggplot object, instead of 3 plots overlain. To create the histograms, I used hist to do the binning, and then scaled them to fit the scale of the logistic plot. It may be a little slow if you have lots of data because it creates the plot twice: once to get the scale and once for the final plot. I also included a non-linear spline fit because that is the type of thing I like to look at.

I also wrote it so it returns a ggplot object that you can add more layers to.

Here is the function (http://gist.github.com/andland/5806238):

logithistplot <- function(data,breaks="Sturges",se=TRUE) {
    require(ggplot2);
    col_names=names(data)

  
    # get min and max axis values
    min_x <- min(data[,1])
    max_x <- max(data[,1])
  
    # get bin numbers
    hist_both <- hist(data[,1],plot=FALSE,breaks=breaks)
    hist0 <- hist(data[data[,2]==0,1],breaks=hist_both$breaks,plot=FALSE)
    hist1 <- hist(data[data[,2]==1,1],breaks=hist_both$breaks,plot=FALSE)
    max_count=max(c(hist0$counts,hist1$counts))
  
    # create plots
    a <- ggplot(data, aes_string(x = col_names[1], y = col_names[2])) +
      theme_bw(base_size=16) +
      geom_smooth(method = "gam", family = "binomial", se = se,
                  aes(colour='Spline',fill='Spline'), size=1.5,formula=y~s(x,bs="cr"))+
      geom_smooth(method = "glm", family = "binomial", se = se,
                  aes(colour='Logit',fill='Logit'), size=1.5) +

      # scale_y_continuous(limits=c(0,1), breaks=c(0,1)) +
      scale_x_continuous(limits=c(min_x,max_x)) +
      theme(panel.grid.major = element_blank(),
            panel.grid.minor=element_blank(),
            panel.background = element_blank()) +
      guides(fill=FALSE)+
      labs(y = "Probability\n", x = paste0("\n",col_names[1]), colour="Method")
  
    y_range_orig=ggplot_build(a)$panel$ranges[[1]]$y.range
    y_range=y_range_orig
    if (y_range[1]<0) y_range[1]=0
    if (y_range[2]>1) y_range[2]=1
    min_y <- min(y_range[1])
    max_y <- max(y_range[2])
  
    # need blank bar up to y_min, bars for 0's, blank bars up to 1's, bars for 1's up to y_max
    hist_df=data.frame(mids=hist_both$mids,
                       counts_below=min_y,
                       counts0=hist0$counts/max_count*diff(y_range)/2,
                       counts_mid=diff(y_range)-hist_both$counts/max_count*diff(y_range)/2,
                       counts1=hist1$counts/max_count*diff(y_range)/2)
    hist.m=melt(hist_df,id=1)
    head(hist.m)
    ggplot()+geom_bar(data=hist.m,aes(mids,value,fill=variable),stat="identity")+coord_cartesian(ylim=y_range)+scale_fill_manual(values=c(NA, "grey", NA,"grey"))+guides(fill=FALSE)
  
    (a <- ggplot(data, aes_string(x = col_names[1], y = col_names[2])) +
      theme_bw(base_size=16) +
      geom_bar(data=hist.m,aes(mids,value,fill=variable),stat="identity")+
      coord_cartesian(ylim=y_range_orig)+
      scale_fill_manual(values=c(NA,NA,"grey","grey","red","blue"))+
      geom_smooth(method = "gam", family = "binomial", se = se,
                  aes(colour='Spline',fill='Spline'), size=1.5,formula=y~s(x,bs="cr"))+
      geom_smooth(method = "glm", family = "binomial", se = se,
                  aes(colour='Logit',fill='Logit'), size=1.5) +
      guides(fill=FALSE)+
      labs(y = "Probability\n", x = paste0("\n",col_names[1]), colour="Method"))
    a
}
# Examples
# logithistplot(mtcars[,c("mpg","vs")],breaks=10,se=FALSE) + ggtitle("Cars Data")
# logithistplot(movies[,c("rating","Action")])+labs(y="Probability of Being an Action Movie")
Reply all
Reply to author
Forward
0 new messages