###################################################
### code chunk number 7: intensityFiltering
###################################################
# ** user-defined groups; default: groups defined by the treatment column in SampleInformation.txt
sample.info$number <- seq(1, nrow(sample.info))
groups <- split(sample.info$number, sample.info$treatment)
# check whether the filtering is already performed, perform it otherwise
# at probeset level
if(file.exists(file = paste(output.folder, "/PresentProbesets", ds, ".Rdata", sep = ""))){
load(file = paste(output.folder, "/PresentProbesets", ds, ".Rdata", sep = ""))
} else {
# remove cross-hybridising probesets
non.crosshyb.probesets <- probesets.NetAffx.32$probeset_id[probesets.NetAffx.32$crosshyb_type == 1]
if(!exists("exFit")){
load(file= paste(output.folder, "CoreExonIntensities.Rdata", sep = "/"))
}
exon.intensities <- exFit[exFit$groupName %in% non.crosshyb.probesets, -c(3:5)]
rm("exFit")
gc()
# ** user-defined criteria for absent/present probesets
# ** criterion 1: probeset absent/present in a group of samples: present in at least half of the samples
present.exons <- lapply(groups, FUN = function(group){
if(length(group) > 1){
apply(log2(exon.intensities[, group + 2]) < 3, 1, sum)/length(group) < 0.5
} else {
log2(exon.intensities[, group + 2]) > 3
}
})
present.exons <- t(do.call(rbind, present.exons)) # convert to dataframe
rownames(present.exons) <- exon.intensities$groupName # use probeset identities
# ** criterion 2: probeset absent/present in the dataset
# remove probesets not present in any of the groups
absent.exons <- apply(present.exons, 1, AllFalse)
probesets.to.keep <- absent.exons[absent.exons == FALSE]
probesets.to.keep <- as.factor(names(probesets.to.keep))
n.present.exons <- length(probesets.to.keep)
rm(list = c("absent.exons"))
save(probesets.to.keep, file = paste(output.folder, "/PresentProbesets", ds, ".Rdata", sep = ""))
}
# check whether transcript filtering has been performed; perform it if not
if(file.exists(file = paste(output.folder, "/PresentTranscripts", ds, ".Rdata", sep = ""))){
load(file = paste(output.folder, "/PresentTranscripts", ds, ".Rdata", sep = ""))
} else {
if(!exists("exon.intensities")){
load(file= paste(output.folder, "CoreExonIntensities.Rdata", sep = "/"))
exon.intensities <- exFit[exFit$groupName %in% non.crosshyb.probesets, -c(3:5)]
rm("exFit")
gc()
}
# ** user-defined criteria for absent transcripts:
# criterion 1: half or more of probesets of transcript present in sample
# --> transcript present in sample
core.transcripts <- unique(exon.intensities$unitName)
# create a list of transcript clusters present/absent in each sample
present.genes <- lapply(core.transcripts, FUN = function(gene){
apply(log2(exon.intensities[exon.intensities$unitName == gene, -c(1:2)]) < 3, 2, sum)/
length(exon.intensities[exon.intensities$unitName == gene,]$groupName) < 0.5
})# FALSE: gene not present in sample
names(present.genes) <- core.transcripts
present.genes <- do.call(rbind, present.genes) # convert to logical matrix
# criterion 2: transcript present in at least half of the samples of a group
# --> transcript present in the group
present.genes.in.group <- lapply(groups, FUN = function(group){
if(length(group) > 1){
apply(present.genes[ , group], 1, sum)/length(group) >= 0.5
} else {
present.genes[ , group]
}
})
present.genes.in.group <- do.call(rbind, present.genes.in.group) # logical matrix
present.genes.in.group <- t(present.genes.in.group)
# keep genes only present in at least two groups
transcripts.to.keep <- apply(present.genes.in.group, 1, TwoOrMoreTrue)
transcripts.to.keep <- names(transcripts.to.keep[transcripts.to.keep == TRUE])
save(transcripts.to.keep, file = paste(output.folder, "/PresentTranscripts", ds, ".Rdata", sep = ""))
rm("exon.intensities")
gc()
}
n.present.transcript.clusters <- length(transcripts.to.keep)