> #annotate genes to transcripts > mart <- biomaRt::useMart(biomart = "ENSEMBL_MART_ENSEMBL", + dataset = "hsapiens_gene_ensembl", + host = 'www.ensembl.org') > > t2g <- biomaRt::getBM(attributes = c("ensembl_transcript_id", "transcript_version", + "ensembl_gene_id", "external_gene_name", "description", + "transcript_biotype"), mart = mart) > > t2g <- dplyr::rename(t2g, target_id = ensembl_transcript_id, + ens_gene = ensembl_gene_id, ext_gene = external_gene_name) > library(sleuth) > base_dir <- "../rnaseq/nalm6/ideala/alt/" > > sample_id <- dir(file.path(base_dir, "results")) > > kal_dirs <- sapply(sample_id, function(id) file.path(base_dir, "results", id, "output")) > > s2c.a.f <- read.table("../rnaseq/nalm6/ideala/design_alt_full.txt", + header = TRUE, stringsAsFactors = FALSE) > > s2c.a.f <- dplyr::mutate(s2c.a.f, path = kal_dirs) > > # Prep object > > so.full <- sleuth_prep(s2c.a.f, + target_mapping = t2g, + aggregation_column = 'ens_gene', + extra_bootstrap_summary = TRUE) reading in kallisto results dropping unused factor levels .................. Error in check_target_mapping(tmp_names, target_mapping) : couldn't solve nonzero intersection
base_dir <- "../rnaseq/nalm6/ideala/alt/"
sample_id <- dir(file.path(base_dir, "results"))
kal_dirs <- sapply(sample_id, function(id) file.path(base_dir, "results", id, "output"))
test_kal <- sleuth::read_kallisto(kal_dirs[1], read_bootstrap = F)
head(test_kal$abundance$target_id)
[1] "ENST00000456328.2|ENSG00000223972.5|OTTHUMG00000000961.2|OTTHUMT00000362751.1|DDX11L1-202|DDX11L1|1657|processed_transcript|"
[2] "ENST00000450305.2|ENSG00000223972.5|OTTHUMG00000000961.2|OTTHUMT00000002844.2|DDX11L1-201|DDX11L1|632|transcribed_unprocessed_pseudogene|"
[3] "ENST00000488147.1|ENSG00000227232.5|OTTHUMG00000000958.1|OTTHUMT00000002839.1|WASH7P-201|WASH7P|1351|unprocessed_pseudogene|"
[4] "ENST00000619216.1|ENSG00000278267.1|-|-|MIR6859-1-201|MIR6859-1|68|miRNA|"
[5] "ENST00000473358.1|ENSG00000243485.5|OTTHUMG00000000959.2|OTTHUMT00000002840.1|MIR1302-2HG-202|MIR1302-2HG|712|lincRNA|"
[6] "ENST00000469289.1|ENSG00000243485.5|OTTHUMG00000000959.2|OTTHUMT00000002841.2|MIR1302-2HG-201|MIR1302-2HG|535|lincRNA|"
The first few lines of my target_mapping file used in sleuth_prep is as follows:
target_id ens_gene ext_gene 1 ENST00000385229.3 ENSG00000283891.1 MIR628 2 ENST00000516122.1 ENSG00000251931.1 RNU6-871P 3 ENST00000385032.1 ENSG00000207766.1 MIR626 4 ENST00000618751.3 ENSG00000275323.3 RPS9 5 ENST00000630768.1 ENSG00000275323.3 RPS9 6 ENST00000610995.1 ENSG00000276678.1 GHRLOS
I think the issue is that I used the gencode v27 annotation for running kallisto and I am trying to use the ensembl 90 (which should be equivalent to gencode v27) to create my target_mapping file for the sleuth analysis. There must be some slight difference between gencode v27 and ensembl 90 that is not apparent to me.Is there a way that I can either use the gencode v27 annotation in the target_mapping file or convert the ensembl 90 to gencode v27? I would think that this should solve the issue. As a person new to computational biology, I would greatly appreciate any assistance.Thank you,Sudarshan Iyer
library(rhdf5)
files <- list.files(<insert working directory>, pattern=".h5", recursive=TRUE, full.names=TRUE)
for (currentFile in files) {
oldids <- h5read(currentFile, "/aux/ids")
newids <- gsub("\\|.*", "", oldids)
h5write(newids, currentFile, "/aux/ids")
}