Revision: 1fed53e6cdcd
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Sun Apr 22 13:23:42 2012
Log: Add support for Polyphen2 annotation
http://code.google.com/p/glu-genetics/source/detail?r=1fed53e6cdcd
Revision: be00a930f6c7
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Sun Apr 22 13:24:21 2012
Log: Minor updates to test code and debugging.
http://code.google.com/p/glu-genetics/source/detail?r=be00a930f6c7
Revision: b235a2fc969c
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Sun Apr 22 13:24:53 2012
Log: Make rtree support manadatory and update default genedb version.
http://code.google.com/p/glu-genetics/source/detail?r=b235a2fc969c
Revision: 48a8f2185676
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Sun Apr 22 13:25:43 2012
Log: Fix segment boundaries for multi-segment targets.
http://code.google.com/p/glu-genetics/source/detail?r=48a8f2185676
==============================================================================
Revision: 1fed53e6cdcd
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Sun Apr 22 13:23:42 2012
Log: Add support for Polyphen2 annotation
http://code.google.com/p/glu-genetics/source/detail?r=1fed53e6cdcd
Modified:
/glu/modules/seq/annotate.py
=======================================
--- /glu/modules/seq/annotate.py Wed Apr 18 09:30:57 2012
+++ /glu/modules/seq/annotate.py Sun Apr 22 13:23:42 2012
@@ -12,6 +12,8 @@
from operator import itemgetter
+import pysam
+
from glu.lib.utils import unique
from glu.lib.fileutils import list_reader, autofile, hyphen
from glu.lib.progressbar import progress_loop
@@ -113,11 +115,28 @@
return infomap
+def polyphen2_code(c):
+ if c=='?':
+ return '.'
+ elif len(c)==1:
+ return c
+ elif 'D' in c:
+ return'D'
+ elif 'd' in c:
+ return'd'
+ elif '?' in c:
+ return'.'
+ elif 'b' in c:
+ return'b'
+ else:
+ return'.'
+
+
def aa_change(e):
return '%s->%s' % (e.ref_aa,e.var_aa) if e.ref_aa or e.var_aa else ''
-def update_vcf_annotation(v, vs, cv, esp, kaviar, refvars, options):
+def update_vcf_annotation(v, vs, cv, esp, kaviar, refvars, polyphen2,
options):
new_info = []
if vs:
@@ -256,6 +275,7 @@
new_info.append('KAVIAR_NAMES=%s' % ','.join(ktext))
+
if refvars:
ingroup,outgroup = refvars.get(v.chrom,v.start,v.end,v.var) if refvars
else ([],[])
@@ -271,6 +291,43 @@
new_info.append('REFVAR_OUTGROUP_NAMES=%s' % outgroup)
v.filter.append('RefVar')
+ if vs and polyphen2 and v.end-v.start==1 and 'CDS' in location:
+ pmap = {'A':0,'C':1,'G':2,'T':3}
+
+ try:
+ pvars = [ p.rstrip().split('\t') for p in
polyphen2.fetch(v.chrom,v.start,v.end) ]
+ except ValueError:
+ pvars = None
+
+ if pvars:
+ hdivs = []
+ hvars = []
+
+ for a in v.var:
+ if a in pmap:
+ i = pmap[a]
+
+ hdiv = hvar = ''
+ for p in pvars:
+ hdiv += p[3][i]
+ hvar += p[4][i]
+
+ hdiv = polyphen2_code(hdiv)
+ hvar = polyphen2_code(hvar)
+ else:
+ hdiv = '.'
+ hvar = '.'
+
+ assert hdiv!='r' and hvar!='r'
+
+ hdivs.append(hdiv)
+ hvars.append(hvar)
+
+ if hdivs.count('.')!=len(hdivs):
+ new_info.append('POLYPHEN2_HDIV=%s' % (','.join(hdivs)))
+ if hvars.count('.')!=len(hvars):
+ new_info.append('POLYPHEN2_HVAR=%s' % (','.join(hvars)))
+
if 'tgp' in v.names:
v.names.remove('tgp')
v.filter.append('1000G')
@@ -292,6 +349,7 @@
vcf = VCFReader(options.variants,sys.stdin)
cv = CGFVariants(options.cgfvariants, options.reference) if
options.cgfvariants else None
esp = VCFReader(options.esp) if options.esp else None
+ polyphen2= pysam.Tabixfile(options.polyphen2) if options.polyphen2 else
None
if options.kaviar:
references = list_reader(options.reference+'.fai')
@@ -352,10 +410,14 @@
metadata['INFO'
].append('##INFO=<ID=REFVAR_INGROUP_NAMES,Number=.,Type=String,Description="Intra-group
samples in which Variant is present">')
metadata['INFO'
].append('##INFO=<ID=REFVAR_OUTGROUP_NAMES,Number=.,Type=String,Description="Extra-group
samples in which Variant is present">')
+ if polyphen2:
+ metadata['INFO'
].append('##INFO=<ID=POLYPHEN2_HDIV,Number=.,Type=String,Description="Polyphen2
HDIV prediction code for each variant SNV allele (b=benign, d=possibly
damaging, D=probably damaging, .=unknown)">')
+ metadata['INFO'
].append('##INFO=<ID=POLYPHEN2_HVAR,Number=.,Type=String,Description="Polyphen2
HVAR prediction code for each variant SNV allele (b=benign, d=possibly
damaging, D=probably damaging, .=unknown)">')
+
out = VCFWriter(options.output, metadata, vcf.samples, options.reference)
for v in vcf:
- update_vcf_annotation(v, vs, cv, esp, kaviar, refvars, options)
+ update_vcf_annotation(v, vs, cv, esp, kaviar, refvars, polyphen2,
options)
out.write_locus(v)
@@ -454,8 +516,8 @@
parser.add_argument('variants', help='Input variant file')
- parser.add_argument('-f', '--format', metavar='NAME', default='GLU',
- help='File format (VCF, GLU)')
+ parser.add_argument('-f', '--format', metavar='NAME', default='VCF',
+ help='File format (VCF)')
parser.add_argument('-g', '--genedb', metavar='NAME',
help='Genedb genome annotation database name or
file')
parser.add_argument('-r', '--reference', metavar='NAME', required=True,
@@ -470,6 +532,8 @@
help='Kaviar annotation (optional)')
parser.add_argument('--refvars', metavar='NAME',
help='Reference variant list')
+ parser.add_argument('--polyphen2', metavar='NAME',
+ help='Polyphen2 exome annotation (optional)')
parser.add_argument('--refingroup', metavar='NAME',
help='List of subjects defined to be intra-group
reference variants')
parser.add_argument('-o', '--output', metavar='FILE', default='-',
==============================================================================
Revision: be00a930f6c7
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Sun Apr 22 13:24:21 2012
Log: Minor updates to test code and debugging.
http://code.google.com/p/glu-genetics/source/detail?r=be00a930f6c7
Modified:
/glu/lib/seqlib/vannotator.py
=======================================
--- /glu/lib/seqlib/vannotator.py Wed Apr 18 09:30:25 2012
+++ /glu/lib/seqlib/vannotator.py Sun Apr 22 13:24:21 2012
@@ -205,7 +205,7 @@
if 0: # DEBUG
parts = self.decode_gene(gene)
for part in parts:
- if part.type not in ('intron','UTR5','UTR3') and '_' not in
part.chrom:
+ if part.type not in ('intron','UTR5','UTR3','UTR') and '_' not
in part.chrom:
print '\t'.join(map(str,[part.chrom,part.start,part.end,gene.symbol]))
sys.stderr.write('Loading complete.\n')
@@ -424,7 +424,7 @@
ref_frame = ref_cds[codon_start:codon_end]
ref_aa = ref_frame.translate()
- assert len(ref_aa)
+ #assert len(ref_aa)
result[-1] += ':aa=%d' % (aa_position+1)
result +=
[False,'SYNONYMOUS',mut_type,ref_cds_nuc,var_cds_nuc,str(ref_aa),str(ref_aa)]
==============================================================================
Revision: b235a2fc969c
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Sun Apr 22 13:24:53 2012
Log: Make rtree support manadatory and update default genedb version.
http://code.google.com/p/glu-genetics/source/detail?r=b235a2fc969c
Modified:
/glu/lib/genedb/__init__.py
/glu/lib/genedb/queries.py
=======================================
--- /glu/lib/genedb/__init__.py Sun Apr 8 12:41:07 2012
+++ /glu/lib/genedb/__init__.py Sun Apr 22 13:24:53 2012
@@ -12,7 +12,7 @@
import sqlite3
-DEFAULT_GENEDB = ['genedb_hg19_snp135_rtree.db']
+DEFAULT_GENEDB = ['genedb_hg19_snp135_2012-04.db']
DEFAULT_PATHS = [os.path.join(os.path.dirname(__file__),'data'),
'/CGF/Resources/Data/genedb',
=======================================
--- /glu/lib/genedb/queries.py Sun Apr 8 12:41:07 2012
+++ /glu/lib/genedb/queries.py Sun Apr 22 13:24:53 2012
@@ -90,61 +90,38 @@
def query_gene_neighborhood(con,chrom,start,end,up,dn):
cur = con.cursor()
- if 'rtree' not in con.filename:
- sql = '''
- SELECT symbol,chrom,MIN(txStart) as start,MAX(txEnd) as end,strand
- FROM gene
- WHERE chrom = ?
- AND ((strand = '+' AND txStart<? AND txEnd>?)
- OR (strand = '-' AND txStart<? AND txEnd>?))
- GROUP BY symbol
- ORDER BY chrom,start,end,symbol,canonical DESC
- '''
- cur.execute(sql, (chrom,end+dn,start-up,end+up,start-dn))
- else:
- sql = '''
- SELECT symbol,chrom,MIN(txStart) as start,MAX(txEnd) as end,strand
- FROM gene
- WHERE chrom = ?
- AND id in (SELECT id
- FROM gene_index
- WHERE txStart < ?
- AND txEnd > ?)
- AND ((strand = '+' AND txStart<? AND txEnd>?)
- OR (strand = '-' AND txStart<? AND txEnd>?))
- GROUP BY symbol
- ORDER BY chrom,start,end,symbol,canonical DESC
- '''
- d = max(dn,up)
- cur.execute(sql, (chrom,end+d,start-d,end+dn,start-up,end+up,start-dn))
+ sql = '''
+ SELECT symbol,chrom,MIN(txStart) as start,MAX(txEnd) as end,strand
+ FROM gene
+ WHERE chrom = ?
+ AND id in (SELECT id
+ FROM gene_index
+ WHERE txStart < ?
+ AND txEnd > ?)
+ AND ((strand = '+' AND txStart<? AND txEnd>?)
+ OR (strand = '-' AND txStart<? AND txEnd>?))
+ GROUP BY symbol
+ ORDER BY chrom,start,end,symbol,canonical DESC
+ '''
+ d = max(dn,up)
+ cur.execute(sql, (chrom,end+d,start-d,end+dn,start-up,end+up,start-dn))
return cur.fetchall()
def query_genes_by_location(con,chrom,start,end):
- if 'rtree' not in con.filename:
- sql = '''
- SELECT symbol as alias,symbol,chrom,MIN(txStart) as start,MAX(txEnd)
as end,strand,"GENE"
- SELECT symbol,symbol,chrom,MIN(txStart) as start, MAX(txEnd) as
end,strand
- FROM gene
- WHERE chrom = ?
- AND txStart<? AND txEnd>?
- GROUP BY symbol
- ORDER BY chrom,start,end,symbol,canonical DESC
- '''
- else:
- sql = '''
- SELECT symbol as alias,symbol,chrom,MIN(txStart) as start,MAX(txEnd)
as end,strand,"GENE"
- SELECT symbol,symbol,chrom,MIN(txStart) as start, MAX(txEnd) as
end,strand
- FROM gene
- WHERE chrom = ?
- AND id in (SELECT id
- FROM gene_index
- WHERE txStart < ?
- AND txEnd > ?)
- GROUP BY symbol
- ORDER BY chrom,start,end,symbol,canonical DESC
- '''
+ sql = '''
+ SELECT symbol as alias,symbol,chrom,MIN(txStart) as start,MAX(txEnd)
as end,strand,"GENE"
+ SELECT symbol,symbol,chrom,MIN(txStart) as start, MAX(txEnd) as
end,strand
+ FROM gene
+ WHERE chrom = ?
+ AND id in (SELECT id
+ FROM gene_index
+ WHERE txStart < ?
+ AND txEnd > ?)
+ GROUP BY symbol
+ ORDER BY chrom,start,end,symbol,canonical DESC
+ '''
cur = con.cursor()
cur.execute(sql, (chrom, end, start))
@@ -337,26 +314,16 @@
def query_snps_by_location(con,chrom,start,end):
cur = con.cursor()
- if 'rtree' not in con.filename:
- sql = '''
- SELECT
name,chrom,start,end,strand,refAllele,alleles,vclass,func,weight
- FROM snp
- WHERE chrom = ?
- AND start < ?
- AND end > ?
- ORDER BY start,name;
- '''
- else:
- sql = '''
- SELECT
name,chrom,start,end,strand,refAllele,alleles,vclass,func,weight
- FROM snp
- WHERE chrom = ?
- AND id in (SELECT id
- FROM snp_index
- WHERE start < ?
- AND end > ?)
- ORDER BY start,name;
- '''
+ sql = '''
+ SELECT name,chrom,start,end,strand,refAllele,alleles,vclass,func,weight
+ FROM snp
+ WHERE chrom = ?
+ AND id in (SELECT id
+ FROM snp_index
+ WHERE start < ?
+ AND end > ?)
+ ORDER BY start,name;
+ '''
cur.execute(sql,(chrom,end,start))
return cur.fetchall()
==============================================================================
Revision: 48a8f2185676
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Sun Apr 22 13:25:43 2012
Log: Fix segment boundaries for multi-segment targets.
http://code.google.com/p/glu-genetics/source/detail?r=48a8f2185676
Modified:
/glu/modules/seq/coverage.py
=======================================
--- /glu/modules/seq/coverage.py Mon Aug 1 10:24:11 2011
+++ /glu/modules/seq/coverage.py Sun Apr 22 13:25:43 2012
@@ -280,7 +280,10 @@
target_dict = {}
for target_contig,ctargets in targets.iteritems():
for target_start,target_end,target_name in ctargets:
- target_dict[target_contig,target_name] = target_start,target_end
+ if (target_contig,target_name) in target_dict:
+ target_dict[target_contig,target_name] = '',''
+ else:
+ target_dict[target_contig,target_name] = target_start,target_end
out = table_writer(options.targetout)