I'm working with 120 samples divided into 6 groups of different sizes. Due to the large number of samples, applying strict filtering during the unite() step causes significant data loss (many CpG loci are removed). To retain more loci, I'm using a less restrictive min.per.group value during merging.
My plan is to apply pool() afterward to create group-wise methylation profiles.
Here’s a simplified example of my code (actual data is larger, but the structure is similar):
#!/usr/bin/env Rscript
# 11/06/25
# Enfoque: se pretende ver si se pueden hacer los pools, uso las muestras del 2003
library('methylKit')
src0 <- "/data2/gonzalo2/alvaro_data2/RRBShistSS/prueba_mapeo_error/methylKit/entire_lib0"
src1 <- "/data2/gonzalo2/alvaro_data2/RRBShistSS/prueba_mapeo_error/methylKit/entire_lib1"
src2 <- "/data2/gonzalo2/alvaro_data2/RRBShistSS/prueba_mapeo_error/methylKit/entire_lib2"
src3 <- "/data2/gonzalo2/alvaro_data2/RRBShistSS/prueba_mapeo_error/methylKit/entire_lib3"
file.list <- list(
paste0(src0, "/SELLA03_12026_CpG.txt"),
paste0(src0, "/SELLA03_1536_CpG.txt"),
paste0(src0, "/SELLA03_1537_CpG.txt"),
paste0(src0, "/SELLA03_9261_CpG.txt"),
paste0(src1, "/SELLA03_9160_CpG.txt"),
paste0(src1, "/SELLA03_12082_CpG.txt"),
paste0(src2, "/SELLA03_1541_CpG.txt"),
paste0(src2, "/SELLA03_9262_CpG.txt"),
paste0(src3, "/SELLA03_1542_CpG.txt"),
paste0(src3, "/SELLA03_12022_CpG.txt"),
paste0(src3, "/SELLA03_12054_CpG.txt"),
paste0(src3, "/SELLA03_9278_CpG.txt"), # Muestras del Sella03
paste0(src0, "/NAR02_10352_CpG.txt"),
paste0(src0, "/NAR03_10741_CpG.txt"),
paste0(src0, "/NAR03_10769_CpG.txt"),
paste0(src0, "/NAR03_2097_CpG.txt"),
paste0(src1, "/NAR03_2032_CpG.txt"),
paste0(src3, "/NAR04_12_CpG.txt") # Muestras del NAR03
)
myobj_03_2pool <- methRead(file.list,
sample.id =
list(
"SELLA03_12026",
"SELLA03_1536",
"SELLA03_1537",
"SELLA03_9261",
"SELLA03_9160",
"SELLA03_12082",
"SELLA03_1541",
"SELLA03_9262",
"SELLA03_1542",
"SELLA03_12022",
"SELLA03_12054",
"SELLA03_9278" ,
"NAR02_10352",
"NAR03_10741",
"NAR03_10769",
"NAR03_2097",
"NAR03_2032",
"NAR04_12"),
treatment = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1),
assembly = "Ssal_v3.1",
context = "CpG",
mincov = 3
)
filt.obj_03_2pool=filterByCoverage(myobj_03_2pool,lo.count=3,lo.perc=NULL,
hi.count=NULL,hi.perc=99.9)
nor.obj_03_2pool=normalizeCoverage(filt.obj_03_2pool)
tiles = tileMethylCounts(nor.obj_03_2pool,win.size=1000,step.size=1000,cov.bases = 10)
meth.min=unite(tiles, min.per.group=1L,destrand=TRUE)
pooled.03=pool(meth.min,sample.ids=c("SELLA03","NAR03"))
myDiff=calculateDiffMeth(pooled.03)
myDiff25p.hyper=getMethylDiff(myDiff,difference=25,qvalue=0.01,type="hyper")
myDiff25p.hypo=getMethylDiff(myDiff,difference=25,qvalue=0.01,type="hypo")
myDiff25p=getMethylDiff(myDiff,difference=25,qvalue=0.01) # methylDiff object with 3910 rows
save(nor.obj_03_2pool, tiles, meth.min, pooled.03, myDiff25p.hyper, myDiff25p.hypo, myDiff25p, file = "/home/alvaro/RRBShist.all/scripts/methylKit/cov_3/pools/obj_03_pool.RData")