Revision: 1093c5c364cb
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Wed Apr 18 09:25:18 2012
Log: Add tabix query capability to VCF reader
http://code.google.com/p/glu-genetics/source/detail?r=1093c5c364cb
Revision: d8a4abd2a02b
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Wed Apr 18 09:30:25 2012
Log: Use +-5 bp for possible splice variation. Use UTR5 and UTR3
instead o...
http://code.google.com/p/glu-genetics/source/detail?r=d8a4abd2a02b
Revision: 96ed9a084b22
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Wed Apr 18 09:30:57 2012
Log: Add support for UW Exome Sequencing Project (ESP) annotation.
http://code.google.com/p/glu-genetics/source/detail?r=96ed9a084b22
==============================================================================
Revision: 1093c5c364cb
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Wed Apr 18 09:25:18 2012
Log: Add tabix query capability to VCF reader
http://code.google.com/p/glu-genetics/source/detail?r=1093c5c364cb
Modified:
/glu/lib/seqlib/vcf.py
=======================================
--- /glu/lib/seqlib/vcf.py Sun Apr 8 12:41:07 2012
+++ /glu/lib/seqlib/vcf.py Wed Apr 18 09:25:18 2012
@@ -20,11 +20,53 @@
VCFRecord = recordtype('VCFRecord', 'chrom start end names ref var qual
filter info format genos')
+strcache = {}
+def _intern(x,strcache=strcache.setdefault): return strcache(x,x)
+
+
+def _encode_vcf_records(rows):
+ for row in rows:
+ chrom = _intern(row[0])
+ start = int(row[1])-1
+ names = row[2].split(';') if row[2]!='.' else []
+ ref = _intern(row[3])
+ end = start+len(ref)
+ var = [ _intern(v) for v in row[4].split(',') ]
+ qual = row[5]
+ filter = [ _intern(f) for f in row[6].split(';') ] if row[6]!='.'
else []
+ info = row[7].split(';') if row[7]!='.' else []
+
+ n = len(row)
+ if n>8:
+ format = _intern(row[8])
+
+ if n>9:
+ if ':' in format:
+ genos = [ g.split(':') for g in row[9:] ] if len(row)>9
else None
+ else:
+ genos = [ [g] for g in row[9:] ] if len(row)>9
else None
+ else:
+ genos = None
+ else:
+ format = genos = None
+
+ # VCF codes indels with an extra left reference base, which we strip
+ r = ref[0]
+ if all(a.startswith(r) for a in var):
+ start += 1
+ ref = _intern(ref[1:])
+ var = [ _intern(v[1:]) for v in var ]
+
+ yield
VCFRecord(chrom,start,end,names,ref,var,qual,filter,info,format,genos)
+
+
class VCFReader(object):
def __init__(self, filename, hyphen=None, field_size_limit=1024*1024):
if csv.field_size_limit()<field_size_limit:
csv.field_size_limit(field_size_limit)
+ self.filename = filename
+ self.tabixfile = None
self.data = data = table_reader(filename,hyphen=sys.stdin)
self.metadata = metadata = OrderedDict()
@@ -57,34 +99,17 @@
def __iter__(self):
- strcache = {}
- def intern(x,strcache=strcache.setdefault): return strcache(x,x)
-
- for row in self.data:
- chrom = intern(row[0])
- start = int(row[1])-1
- names = row[2].split(';') if row[2]!='.' else []
- ref = intern(row[3])
- end = start+len(ref)
- var = [ intern(v) for v in row[4].split(',') ]
- qual = row[5]
- filter = [ intern(f) for f in row[6].split(';') ] if row[6]!='.'
else []
- info = row[7].split(';')
- format = intern(row[8])
-
- if ':' in format:
- genos = [ g.split(':') for g in row[9:] ] if len(row)>9 else
None
- else:
- genos = [ [g] for g in row[9:] ] if len(row)>9 else
None
-
- # VCF codes indels with an extra left reference base, which we strip
- r = ref[0]
- if all(a.startswith(r) for a in var):
- start += 1
- ref = intern(ref[1:])
- var = [ intern(v[1:]) for v in var ]
-
- yield
VCFRecord(chrom,start,end,names,ref,var,qual,filter,info,format,genos)
+ return _encode_vcf_records(self.data)
+
+ def fetch(self, chromosome, start, stop):
+ tabixfile = self.tabixfile
+
+ if tabixfile is None:
+ self.tabixfile = tabixfile =
pysam.Tabixfile(self.filename,cache_size=128*1024*1024)
+
+ records = [ r.split('\t') for r in tabixfile.fetch(chromosome, start,
stop) ]
+
+ return _encode_vcf_records(records)
class VCFWriter(object):
==============================================================================
Revision: d8a4abd2a02b
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Wed Apr 18 09:30:25 2012
Log: Use +-5 bp for possible splice variation. Use UTR5 and UTR3
instead of more
complex "5' UTR" and "3' UTR" tags in output. Better define non-coding
transcripts with bare UTR tags instead of incorrectly/confusingly showing
them as "5' UTR".
http://code.google.com/p/glu-genetics/source/detail?r=d8a4abd2a02b
Modified:
/glu/lib/seqlib/vannotator.py
=======================================
--- /glu/lib/seqlib/vannotator.py Sun Apr 8 12:41:07 2012
+++ /glu/lib/seqlib/vannotator.py Wed Apr 18 09:30:25 2012
@@ -76,14 +76,14 @@
# increasing order and is bordered by the 5' UTR to the "left" and 3'
# UTR to the "right" in genome orientation.
exon_nums = count(1)
- left,right = "5' UTR","3' UTR"
+ left,right = 'UTR5','UTR3'
intron_offset = -1
else:
# Otherwise, genes transcribed on the reverse genomic strand are
# numbered in increasing order and is bordered by the 3' UTR to the
# "left" and 5' UTR to the "right" in genome orientation.
exon_nums = xrange(len(starts),0,-1)
- left,right = "3' UTR","5' UTR"
+ left,right = 'UTR3','UTR5'
intron_offset = 0
if gene.category=='noncoding' or cds_start==cds_end:
@@ -202,6 +202,12 @@
for gene in trans:
feature_map[gene.chrom].insert(gene.txStart,gene.txEnd,gene)
+ if 0: # DEBUG
+ parts = self.decode_gene(gene)
+ for part in parts:
+ if part.type not in ('intron','UTR5','UTR3') and '_' not in
part.chrom:
+
print '\t'.join(map(str,[part.chrom,part.start,part.end,gene.symbol]))
+
sys.stderr.write('Loading complete.\n')
@@ -291,9 +297,9 @@
parts = set(intersect)
mut_type = set()
- for splice in gene_parts.find_values(ref_start-10,ref_end+10):
+ for splice in gene_parts.find_values(ref_start-5,ref_end+5):
if splice.type=='CDS' or 'UTR' in splice.type:
- if (0<splice.start-ref_end<=10) or (0<ref_start-splice.end<=10):
+ if (0<splice.start-ref_end<=5) or (0<ref_start-splice.end<=5):
mut_type.add('POSSIBLE INTRONIC SPLICE VARIANT')
parts = ','.join(sorted(parts))
@@ -307,7 +313,7 @@
evidence.append([parts,gene,'',True,'NON-SYNONYMOUS',mut_type,ref_nuc,var_nuc,'',''])
elif mut_type:
evidence.append([parts,gene,'',True,'PREDICTED-DISRUPT-TRANSCRIPT',mut_type,ref_nuc,var_nuc,'',''])
- elif len(intersect["5' UTR"])+len(intersect["3' UTR"]):
+ elif len(intersect['UTR5'])+len(intersect['UTR3']):
evidence.append([parts,gene,'',False,'UNKNOWN-UTR',mut_type,ref_nuc,var_nuc,'',''])
elif len(intersect['intron']):
evidence.append([parts,gene,'',False,'UNKNOWN-INTRONIC',mut_type,ref_nuc,var_nuc,'',''])
==============================================================================
Revision: 96ed9a084b22
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Wed Apr 18 09:30:57 2012
Log: Add support for UW Exome Sequencing Project (ESP) annotation.
http://code.google.com/p/glu-genetics/source/detail?r=96ed9a084b22
Modified:
/glu/modules/seq/annotate.py
=======================================
--- /glu/modules/seq/annotate.py Sun Apr 8 12:41:07 2012
+++ /glu/modules/seq/annotate.py Wed Apr 18 09:30:57 2012
@@ -101,11 +101,23 @@
return None
+def make_infomap(info):
+ infomap = {}
+ for inf in info:
+ if '=' in inf:
+ key,value = inf.split('=',1)
+ else:
+ key,value = inf,''
+
+ infomap[key] = value
+ return infomap
+
+
def aa_change(e):
return '%s->%s' % (e.ref_aa,e.var_aa) if e.ref_aa or e.var_aa else ''
-def update_vcf_annotation(v, vs, cv, kaviar, refvars, options):
+def update_vcf_annotation(v, vs, cv, esp, kaviar, refvars, options):
new_info = []
if vs:
@@ -158,7 +170,7 @@
new_info.append('GENE_FUNCTION_DETAILS=%s' % (','.join(nsinfo )))
if segdups:
- segdup_info = ('%s:%s-%s:%.2f' %
(s.other_chrom,s.other_start,s.other_stop,s.matchFraction*100) for s in
segdups)
+ segdup_info = ('chr%s:%s-%s:%.2f' %
(s.other_chrom,s.other_start,s.other_stop,s.matchFraction*100) for s in
segdups)
segdup_info = ','.join(segdup_info)
v.filter.append('SegDup')
new_info.append('SEGDUP_COUNT=%d' % len(segdups))
@@ -190,6 +202,42 @@
if cvinfo.common_score>options.commonscore:
v.filter.append('Common')
+ if esp:
+ chrom = v.chrom
+ vars = set(v.var)
+ if chrom.startswith('chr'):
+ chrom = chrom[3:]
+
+ try:
+ esp_info = esp.fetch(chrom,v.start,v.end)
+ esp_info = [ e for e in esp_info if e.start==v.start and
e.end==v.end and vars.intersection(e.var) ]
+ except ValueError:
+ esp_info = None
+
+ if esp_info:
+ gtc = maf_ea = maf_aa = 0
+
+ for einfo in esp_info:
+ infomap = make_infomap(einfo.info)
+ if 'MAF' in infomap:
+ mafs = map(float,infomap['MAF'].split(','))
+ maf_ea = max(maf_ea,mafs[0]/100)
+ maf_aa = max(maf_aa,mafs[1]/100)
+
+ if 'GTC' in infomap:
+ gtcs = map(int,infomap['GTC'].split(','))
+ gtc = max(gtc,gtcs[0]+gtcs[1])
+
+ maf = max(maf_ea,maf_aa)
+
+ v.filter.append('ESP')
+ if maf>options.commonscore:
+ v.filter.append('ESPCommon')
+
+ new_info.append('ESP_COUNT=%d' % gtc)
+ new_info.append('ESP_MAF_EA=%.4f' % maf_ea)
+ new_info.append('ESP_MAF_AA=%.4f' % maf_aa)
+
if kaviar:
kinfo = kaviar.get(v.chrom,v.start) or []
ktext = [ stuff.replace(';',',') for
chrom,loc,mallele,maf,allele,stuff in kinfo if allele in v.var ]
@@ -227,9 +275,6 @@
v.names.remove('tgp')
v.filter.append('1000G')
- #if any(g[0]=='./.' for g in v.genos):
- # v.filter.append('PartiallyCalled')
-
if not v.ref or v.var==['']:
v.filter.append('Indel')
@@ -246,6 +291,7 @@
vs = VariantAnnotator(options.genedb, options.reference)
vcf = VCFReader(options.variants,sys.stdin)
cv = CGFVariants(options.cgfvariants, options.reference) if
options.cgfvariants else None
+ esp = VCFReader(options.esp) if options.esp else None
if options.kaviar:
references = list_reader(options.reference+'.fai')
@@ -268,20 +314,6 @@
metadata['FILTER'].append('##FILTER=<ID=SegDup,Description="Variant
occurs in a segmentally duplicated region">')
metadata['FILTER'].append('##FILTER=<ID=Repeat,Description="Variant
occurs in a repetitive or low-complexity region">')
- if cv:
- metadata['FILTER'].append('##FILTER=<ID=Common,Description="Variant is
likely common with common score>%f">' % options.commonscore)
- metadata['FILTER'].append('##FILTER=<ID=1000G,Description="Variant was
reported by 1000 Genomes project">')
-
- if kaviar:
-
metadata['FILTER'].append('##FILTER=<ID=KaviarCommon,Description="Variant
appears in the Kaviar database and appears to be common with MAF>%f">' %
options.commonscore)
- metadata['FILTER'].append('##FILTER=<ID=Kaviar,Description="Variant
appears in the Kaviar database">')
-
- if refvars:
- metadata['FILTER'].append('##FILTER=<ID=RefVar,Description="Variant
appears in the local reference variant list">')
-
-
#metadata['FILTER'].append('##FILTER=<ID=NotDominant,Description="Variant
does not fit dominant heritibility model">')
-
#metadata['FILTER'].append('##FILTER=<ID=NotRecessive,Description="Variant
does not fit recessive heritibility model">')
-
metadata['INFO'].append('##INFO=<ID=CYTOBAND,Number=.,Type=String,Description="Name
of cytoband(s) containing variant">')
metadata['INFO'].append('##INFO=<ID=GENE_NAME,Number=.,Type=String,Description="Name
of gene(s) containing variant">')
metadata['INFO'].append('##INFO=<ID=GENE_ID,Number=.,Type=String,Description="Entrez/LocusLink
gene identifiers of genes containing variant">')
@@ -294,24 +326,36 @@
metadata['INFO'].append('##INFO=<ID=REPEAT_INFO,Number=.,Type=String,Description="Details
of repetitive or low complexity elements that overlap variant locus">')
if cv:
-
metadata['INFO'].append('##INFO=<ID=COMMON_SCORE,Number=1,Type=Float,Description="Common
score: maximum allele frequency in any population for rarest allele">')
-
metadata['INFO'].append('##INFO=<ID=FUNCTION_INFO,Number=.,Type=String,Description="Annotated
as function by OMIM, dbSNP, or COSMIC">')
-
metadata['INFO'].append('##INFO=<ID=INEXACT_VARIANTS,Number=.,Type=String,Description="Observed
variant matches inexactly: it has different alleles or overlaps observed">')
+ metadata['FILTER'].append('##FILTER=<ID=Common,Description="Variant is
likely common with common score>%f">' % options.commonscore)
+ metadata['FILTER'].append('##FILTER=<ID=1000G,Description="Variant was
reported by 1000 Genomes project">')
+ metadata['INFO'
].append('##INFO=<ID=COMMON_SCORE,Number=1,Type=Float,Description="Common
score: maximum allele frequency in any population for rarest allele">')
+ metadata['INFO'
].append('##INFO=<ID=FUNCTION_INFO,Number=.,Type=String,Description="Annotated
as function by OMIM, dbSNP, or COSMIC">')
+ metadata['INFO'
].append('##INFO=<ID=INEXACT_VARIANTS,Number=.,Type=String,Description="Observed
variant matches inexactly: it has different alleles or overlaps observed">')
+
+ if esp:
+ metadata['FILTER'].append('##FILTER=<ID=ESP,Description="Variant
appears in the UW ESP database">')
+ metadata['FILTER'].append('##FILTER=<ID=ESPCommon,Description="Variant
appears in the UW ESP database and appears to be common with MAF>%f">' %
options.commonscore)
+ metadata['INFO'
].append('##INFO=<ID=ESP_COUNT,Number=1,Type=Integer,Description="Count of
individuals with one or more variant alleles in UW ESP database">')
+ metadata['INFO'
].append('##INFO=<ID=ESP_MAF_EA,Number=1,Type=Float,Description="Minor
allele frequency in European-Americans according to UW ESP database">')
+ metadata['INFO'
].append('##INFO=<ID=ESP_MAF_AA,Number=1,Type=Float,Description="Minor
allele frequency in African-Americans according to UW ESP database">')
if kaviar:
-
metadata['INFO'].append('##INFO=<ID=KAVIAR_MAF,Number=1,Type=Float,Description="Minor
allele frequency accourding to Kaviar database">')
-
metadata['INFO'].append('##INFO=<ID=KAVIAR_NAMES,Number=.,Type=String,Description="Samples
or datasets from Kaviar in which variant was found">')
+ metadata['FILTER'].append('##FILTER=<ID=Kaviar,Description="Variant
appears in the Kaviar database">')
+
metadata['FILTER'].append('##FILTER=<ID=KaviarCommon,Description="Variant
appears in the Kaviar database and appears to be common with MAF>%f">' %
options.commonscore)
+ metadata['INFO'
].append('##INFO=<ID=KAVIAR_MAF,Number=1,Type=Float,Description="Minor
allele frequency according to Kaviar database">')
+ metadata['INFO'
].append('##INFO=<ID=KAVIAR_NAMES,Number=.,Type=String,Description="Samples
or datasets from Kaviar in which variant was found">')
if refvars:
-
metadata['INFO'].append('##INFO=<ID=REFVAR_INGROUP_COUNT,Number=1,Type=Integer,Description="Count
of times variant is present in intra-group samples">')
-
metadata['INFO'].append('##INFO=<ID=REFVAR_OUTGROUP_COUNT,Number=1,Type=Integer,Description="Count
of times variant is present in extra-group samples">')
-
metadata['INFO'].append('##INFO=<ID=REFVAR_INGROUP_NAMES,Number=.,Type=String,Description="Intra-group
samples in which Variant is present">')
-
metadata['INFO'].append('##INFO=<ID=REFVAR_OUTGROUP_NAMES,Number=.,Type=String,Description="Extra-group
samples in which Variant is present">')
+ metadata['FILTER'].append('##FILTER=<ID=RefVar,Description="Variant
appears in the local reference variant list">')
+ metadata['INFO'
].append('##INFO=<ID=REFVAR_INGROUP_COUNT,Number=1,Type=Integer,Description="Count
of times variant is present in intra-group samples">')
+ metadata['INFO'
].append('##INFO=<ID=REFVAR_OUTGROUP_COUNT,Number=1,Type=Integer,Description="Count
of times variant is present in extra-group samples">')
+ metadata['INFO'
].append('##INFO=<ID=REFVAR_INGROUP_NAMES,Number=.,Type=String,Description="Intra-group
samples in which Variant is present">')
+ metadata['INFO'
].append('##INFO=<ID=REFVAR_OUTGROUP_NAMES,Number=.,Type=String,Description="Extra-group
samples in which Variant is present">')
out = VCFWriter(options.output, metadata, vcf.samples, options.reference)
for v in vcf:
- update_vcf_annotation(v, vs, cv, kaviar, refvars, options)
+ update_vcf_annotation(v, vs, cv, esp, kaviar, refvars, options)
out.write_locus(v)
@@ -420,6 +464,8 @@
help='CGFvariant database annotation')
parser.add_argument('--commonscore', metavar='T', type=float,
default=0.05,
help='Annotate all variants with common score > T')
+ parser.add_argument('--esp', metavar='NAME',
+ help='UW Exome Sequencing Project (ESP) annotation
(optional)')
parser.add_argument('--kaviar', metavar='NAME',
help='Kaviar annotation (optional)')
parser.add_argument('--refvars', metavar='NAME',