[glu-genetics] 5 new revisions pushed by bioinformed@gmail.com on 2012-02-21 14:37 GMT


glu-ge...@googlecode.com

Feb 21, 2012, 9:38:39 AM
to glu...@googlegroups.com
5 new revisions:

Revision: e2f457c90144
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Tue Feb 21 06:26:21 2012
Log: Decode function_score into database names.
http://code.google.com/p/glu-genetics/source/detail?r=e2f457c90144

Revision: 4dac9b1d0ecf
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Tue Feb 21 06:30:06 2012
Log: Made gene queries return only canonical mappings by default.
http://code.google.com/p/glu-genetics/source/detail?r=4dac9b1d0ecf

Revision: f3db1ea6a13e
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Tue Feb 21 06:30:56 2012
Log: Return function info, rather than function score now that decoding is...
http://code.google.com/p/glu-genetics/source/detail?r=f3db1ea6a13e

Revision: ae88c5a0d567
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Tue Feb 21 06:31:32 2012
Log: Update detection of loss-of-stop variants now that we annotate complex...
http://code.google.com/p/glu-genetics/source/detail?r=ae88c5a0d567

Revision: 2afd63200a75
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Tue Feb 21 06:32:18 2012
Log: Add code to allow remapping contigs.
http://code.google.com/p/glu-genetics/source/detail?r=2afd63200a75

==============================================================================
Revision: e2f457c90144
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Tue Feb 21 06:26:21 2012
Log: Decode function_score into database names.
http://code.google.com/p/glu-genetics/source/detail?r=e2f457c90144

Modified:
/glu/lib/seqlib/cgfvariants.py

=======================================
--- /glu/lib/seqlib/cgfvariants.py Wed Feb 1 08:26:47 2012
+++ /glu/lib/seqlib/cgfvariants.py Tue Feb 21 06:26:21 2012
@@ -10,7 +10,7 @@
from glu.lib.recordtype import recordtype


-VarInfo = namedtuple('VarInfo', 'exact_vars inexact_vars common_score function_score')
+VarInfo = namedtuple('VarInfo', 'exact_vars inexact_vars common_score function_info')
VarRecord = recordtype('VarRecord', 'chromosome start stop allele common_score function_score source')
VariantKey = namedtuple('VariantKey', 'chromosome start stop source')

@@ -91,4 +91,11 @@
inexact_vars = sorted(set(inexact_vars))
common_score = min(common_score.values()) if common_score else 0

- return VarInfo(exact_vars,inexact_vars,common_score,function_score)
+ functions = []
+ if function_score:
+ if function_score&0x01: functions.append('cosmic')
+ if function_score&0x02: functions.append('lsdb')
+ if function_score&0x04: functions.append('other')
+ if function_score&0x08: functions.append('omim')
+
+ return VarInfo(exact_vars,inexact_vars,common_score,functions)

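For readers of the library, the new decoding can be pictured as a small standalone helper. The sketch below only restates the bit-to-name mapping added in the diff above; the names FUNCTION_BITS and decode_function_score are illustrative and not part of the commit.

# Hedged sketch: mirrors the bitmask decoding introduced above.
FUNCTION_BITS = [(0x01, 'cosmic'),
                 (0x02, 'lsdb'),
                 (0x04, 'other'),
                 (0x08, 'omim')]

def decode_function_score(function_score):
    '''Translate a packed function_score bitmask into database names.'''
    if not function_score:
        return []
    return [name for bit, name in FUNCTION_BITS if function_score & bit]

# Example: a variant flagged in both COSMIC and OMIM
# decode_function_score(0x01 | 0x08) -> ['cosmic', 'omim']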
==============================================================================
Revision: 4dac9b1d0ecf
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Tue Feb 21 06:30:06 2012
Log: Made gene queries return only canonical mappings by default.
http://code.google.com/p/glu-genetics/source/detail?r=4dac9b1d0ecf

Modified:
/glu/lib/genedb/queries.py

=======================================
--- /glu/lib/genedb/queries.py Wed Jan 4 10:09:24 2012
+++ /glu/lib/genedb/queries.py Tue Feb 21 06:30:06 2012
@@ -22,7 +22,7 @@
cyto_re = re.compile('(\d+|X|Y)(?:([p|q])(?:(\d+)(.\d+)?)?)?$')


-def query_genes_by_name(con, gene, canonical=None, mapped=None):
+def query_genes_by_name(con, gene, canonical=1, mapped=None):
sql = '''
SELECT a.Alias,g.symbol,g.chrom,MIN(g.txStart) as start,MAX(g.txEnd) as end,g.strand,"GENE"
FROM alias a, gene g
@@ -35,6 +35,7 @@

if canonical is not None:
conditions.append('g.canonical & %d' % canonical)
+ conditions.append("g.chrom NOT LIKE '%!_%' ESCAPE '!'")

if mapped:
conditions += [ 'g.txStart<>""', 'g.txEnd<>""' ]
@@ -50,14 +51,28 @@

cur = con.cursor()
cur.execute(sql, (gene,))
- return cur.fetchall()
-
-
-def query_gene_by_name(con,gene):
+ genes = cur.fetchall()
+
+ if len(genes)>1:
+ new_genes = [ g for g in genes if g[1]==gene ]
+ if new_genes:
+ genes = new_genes
+
+ if len(genes)>1:
+ new_genes = [ g for g in genes if g[1].upper()==gene.upper() ]
+ if new_genes:
+ genes = new_genes
+
+ return genes
+
+
+def query_gene_by_name(con,gene,canonical=None,mapped=None):
genes = query_genes_by_name(con,gene)
+
if not genes:
raise KeyError('Cannot find gene "%s"' % gene)
elif len(genes) > 1:
+
raise KeyError('Gene not unique "%s"' % gene)
return genes[0]


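A rough usage sketch of the revised query follows, assuming the genedb is an SQLite file (the path and gene symbol are invented for illustration). With the new default of canonical=1 plus the added chromosome condition, queries return only canonical transcripts mapped to primary chromosomes (contig names without an underscore), and ambiguous alias hits are narrowed to exact symbol matches.

import sqlite3
from glu.lib.genedb.queries import query_genes_by_name, query_gene_by_name

con = sqlite3.connect('genedb.db')                 # illustrative path

# New default: canonical transcripts on primary chromosomes only
genes = query_genes_by_name(con, 'TP53')

# Recover the old, unfiltered behavior by disabling the canonical filter
all_hits = query_genes_by_name(con, 'TP53', canonical=None)

# Unique lookup; raises KeyError when the symbol is missing or still ambiguous
gene = query_gene_by_name(con, 'TP53')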
==============================================================================
Revision: f3db1ea6a13e
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Tue Feb 21 06:30:56 2012
Log: Return function info, rather than function score now that decoding is performed.
http://code.google.com/p/glu-genetics/source/detail?r=f3db1ea6a13e

Modified:
/glu/modules/seq/annotate.py

=======================================
--- /glu/modules/seq/annotate.py Tue Jan 3 10:35:13 2012
+++ /glu/modules/seq/annotate.py Tue Feb 21 06:30:56 2012
@@ -154,12 +154,15 @@
cvinfo = cv.score_and_classify(v.chrom,v.start,v.end,[v.ref,v.var[0]])
if cvinfo.exact_vars:
v.names = sorted(set(v.replace('dbsnp:','rs') for v in cvinfo.exact_vars)|set(v.names))
- inexact = ','.join(sorted(set(cvinfo.inexact_vars))) if cvinfo.inexact_vars else ''

new_info.append('COMMON_SCORE=%.2f' % cvinfo.common_score)
- new_info.append('FUNCTION_SCORE=%d' % cvinfo.function_score)
-
- if inexact:
+
+ if cvinfo.function_info:
+ function_info = ','.join(cvinfo.function_info)
+ new_info.append('FUNCTION_INFO=%s' % function_info)
+
+ if cvinfo.inexact_vars:
+ inexact = ','.join(sorted(set(cvinfo.inexact_vars)))
new_info.append('INEXACT_VARIANTS=%s' % inexact)

if cvinfo.common_score>options.commonscore:
@@ -201,8 +204,11 @@
v.names.remove('tgp')
v.filter.append('1000G')

- if './.' in v.genos:
- filter.append('PartiallyCalled')
+ if any(g[0]=='./.' for g in v.genos):
+ v.filter.append('PartiallyCalled')
+
+ if not v.ref or v.var==['']:
+ v.filter.append('Indel')

# Remove any old fields that have been replaced by a new field
new_info_fields = set(f.split('=',1)[0] for f in new_info)
@@ -231,6 +237,7 @@

metadata = vcf.metadata

metadata['FILTER'].append('##FILTER=<ID=PartiallyCalled,Description="Variant is not called for one or more samples">')
+ metadata['FILTER'].append('##FILTER=<ID=Indel,Description="Variant is an insertion or deletion">')
metadata['FILTER'].append('##FILTER=<ID=Intergenic,Description="Variant not in or near a gene">')

metadata['FILTER'].append('##FILTER=<ID=NotPredictedFunctional,Description="Variant is not predicted to alter a protein">')

@@ -257,8 +264,8 @@

if cv:

metadata['INFO'].append('##INFO=<ID=COMMON_SCORE,Number=1,Type=Float,Description="Common score: maximum allele frequency in any population for rarest allele">')
- metadata['INFO'].append('##INFO=<ID=FUNCTION_SCORE,Number=1,Type=Int,Description="Function score: reported as functional variant in OMIM, dbSNP, or COSMIC">')
- metadata['INFO'].append('##INFO=<ID=INEXACT_VARIANTS,Number=.,Type=String,Description="Inexact variant matche">')
+ metadata['INFO'].append('##INFO=<ID=FUNCTION_INFO,Number=.,Type=String,Description="Annotated as function by OMIM, dbSNP, or COSMIC">')
+ metadata['INFO'].append('##INFO=<ID=INEXACT_VARIANTS,Number=.,Type=String,Description="Observed variant matches inexactly: it has different alleles or overlaps observed">')

if kaviar:

metadata['INFO'].append('##INFO=<ID=KAVIAR_MAF,Number=1,Type=Float,Description="Minor allele frequency accourding to Kaviar database">')

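The effect on the INFO column can be summarized with a short sketch. The build_info helper below is hypothetical and only restates the logic of the hunk above; it assumes cvinfo has the fields of the updated VarInfo record (exact_vars, inexact_vars, common_score, function_info).

def build_info(cvinfo):
    # Hypothetical helper restating the INFO assembly from the diff above
    new_info = ['COMMON_SCORE=%.2f' % cvinfo.common_score]

    # FUNCTION_INFO now carries database names instead of a numeric score
    if cvinfo.function_info:                       # e.g. ['cosmic', 'omim']
        new_info.append('FUNCTION_INFO=%s' % ','.join(cvinfo.function_info))

    if cvinfo.inexact_vars:
        inexact = ','.join(sorted(set(cvinfo.inexact_vars)))
        new_info.append('INEXACT_VARIANTS=%s' % inexact)

    return new_info

# e.g. build_info(VarInfo([], ['dbsnp:12345'], 0.03, ['omim']))
#      -> ['COMMON_SCORE=0.03', 'FUNCTION_INFO=omim', 'INEXACT_VARIANTS=dbsnp:12345']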
==============================================================================
Revision: ae88c5a0d567
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Tue Feb 21 06:31:32 2012
Log: Update detection of loss-of-stop variants now that we annotate complex variants.
http://code.google.com/p/glu-genetics/source/detail?r=ae88c5a0d567

Modified:
/glu/lib/seqlib/vannotator.py

=======================================
--- /glu/lib/seqlib/vannotator.py Tue Jan 3 10:35:13 2012
+++ /glu/lib/seqlib/vannotator.py Tue Feb 21 06:31:32 2012
@@ -39,6 +39,31 @@
GeneEvidence = recordtype('GeneEvidence', 'chrom cytoband ref_start ref_end intersect gene details '
                                          'func func_class func_type ref_nuc var_nuc ref_aa var_aa')

+
+def find_all(s, ss):
+ start = 0
+ n = len(ss)
+ while True:
+ start = s.find(ss, start)
+ if start == -1: return
+ yield start
+ start += n
+
+
+def common_suffix(s1,s2):
+ min_len = min(len(s1), len(s2) )
+ suffix = []
+
+ for c1,c2 in izip(reversed(s1),reversed(s2)):
+ if c1==c2:
+ suffix.append(c1)
+ else:
+ break
+
+ suffix.reverse()
+ return ''.join(suffix)
+
+
def decode_gene(gene):
# Decode CDS boundaries and exon start and end coordinates
cds_start = int(gene.cdsStart)
@@ -373,8 +398,9 @@
result += [False,'SYNONYMOUS','REFERENCE',ref_nuc,var_nuc,'','']
return result

- ref_frame = len(ref_nuc)%3
- var_frame = len(var_nuc)%3
+ ref_frame = len(ref_nuc)%3
+ var_frame = len(var_nuc)%3
+ frameshift = (len(ref_nuc)-len(var_nuc))%3

if 0:
print ' REF_FRAME: %d' % ref_frame
@@ -463,22 +489,23 @@

# Classify non-synonymous change by comparing AA sequences

- ref_stop = ref_aa.find('*')
- var_stop = var_aa.find('*')
-
- if ref_stop==-1:
- ref_stop = len(ref_aa)
- if var_stop==-1:
- var_stop = len(var_aa)
-
- if var_stop<ref_stop:
- mut_type.append('PREMATURE STOP')
- elif var_stop>ref_stop:
- mut_type.append('LOSS OF STOP')
-
- if len(ref_aa)==len(var_aa):
+ # Make sure ref protein doesn't appear to have spurious stops
+
+ r = ref_cds_aa.rstrip('*')
+ v = var_cds_aa.rstrip('*')
+
+ ref_stop = r.find('*')
+ var_stop = v.find('*')
+
+ if ref_stop==-1:
+ if var_stop!=-1 and not v.startswith(r):
+ mut_type.append('PREMATURE STOP')
+ elif ref_cds_aa[-1]=='*' and var_cds_aa[-1]!='*':
+ mut_type.append('LOSS OF STOP')
+
+ if len(r)==len(v):
mut_type.append('SUBSTITUTION')
- elif len(ref_aa)>len(var_aa):
+ elif len(r)>len(v):
mut_type.append('DELETION')
else:
mut_type.append('INSERTION')

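The new stop classification can be read in isolation as follows. Since the mailing-list copy drops indentation, this is one plausible reading of the hunk above, not the committed function itself; it assumes ref_cds_aa and var_cds_aa are non-empty translated CDS protein strings that end in '*' when a stop codon is present.

def classify_stop_change(ref_cds_aa, var_cds_aa):
    # Hypothetical restatement of the classification in the diff above.
    # Assumes both protein strings are non-empty.
    mut_type = []

    # Trim trailing stops so the comparison ignores the normal terminator
    r = ref_cds_aa.rstrip('*')
    v = var_cds_aa.rstrip('*')

    # Only classify stop gain/loss when the reference has no internal stop
    if r.find('*') == -1:
        if v.find('*') != -1 and not v.startswith(r):
            mut_type.append('PREMATURE STOP')
        elif ref_cds_aa[-1] == '*' and var_cds_aa[-1] != '*':
            mut_type.append('LOSS OF STOP')

    if len(r) == len(v):
        mut_type.append('SUBSTITUTION')
    elif len(r) > len(v):
        mut_type.append('DELETION')
    else:
        mut_type.append('INSERTION')

    return mut_type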
==============================================================================
Revision: 2afd63200a75
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Tue Feb 21 06:32:18 2012
Log: Add code to allow remapping contigs.
http://code.google.com/p/glu-genetics/source/detail?r=2afd63200a75

Modified:
/glu/modules/seq/filter.py

=======================================
--- /glu/modules/seq/filter.py Thu Dec 29 11:09:15 2011
+++ /glu/modules/seq/filter.py Tue Feb 21 06:32:18 2012
@@ -13,15 +13,16 @@

from collections import deque
from operator import attrgetter, itemgetter
-from itertools import groupby, imap, izip
+from itertools import groupby, imap, izip, count

import pysam

from glu.lib.utils import consume
-from glu.lib.fileutils import autofile, hyphen, list_reader, table_writer
+from glu.lib.fileutils import autofile, hyphen, list_reader, table_writer, table_reader
from glu.lib.progressbar import progress_loop

from glu.lib.seqlib.bed import read_features
+from glu.lib.seqlib.cigar import fix_cigar_bam, cigar_bam_str
from glu.lib.seqlib.filter import alignment_filter_options, filter_alignments


@@ -43,23 +44,57 @@
rg = ('RG',groupname)

for align in aligns:
- tags = align.tags or []
-
- try:
- names,values = zip(*tags)
- idx = names.index('RG')
- tags[idx] = rg
-
- except ValueError:
- tags.append(rg)
-
- align.tags = tags
+ tags = align.tags
+
+ if tags:
+ align.tags = tags = [ (k,v) for k,v in tags if k[0]!='X' ]
+
+ if tags is None:
+ align.tags = [rg]
+ elif not tags:
+ align.tags.append(rg)
+ else:
+ for i,(name,value) in enumerate(tags):
+ if name=='RG':
+ tags[i] = rg
+ break
+ else:
+ tags.append(rg)
+
+ align.tags = tags
+
+ #tags = align.tags or []
+ #
+ #try:
+ # names,values = zip(*tags)
+ # idx = names.index('RG')
+ # tags[idx] = rg
+ #
+ #except ValueError:
+ # tags.append(rg)
+ #
+ #align.tags = tags

yield align

return _set_readgroup()


+from glu.lib.seqlib.edits import reduce_match
+
+def fix_cigar_indels(aligns):
+ for align in aligns:
+ if align.tid>=0:
+ #before = align.cigar
+ align.cigar = fix_cigar_bam(align.cigar)
+ #if align.cigar is not before:
+ # print 'BEFORE:',cigar_bam_str(before)
+ # print 'AFTER: ',cigar_bam_str(align.cigar)
+ # print
+
+ yield align
+
+
def contig_stats(aligns,references,filename):
stats = [0]*len(references)
for align in aligns:
@@ -75,6 +110,7 @@
def simple_filter(aligns,controls,stats,options):
minreadlen = options.minreadlen

+ keep = options.action=='keep'
fail = options.action=='fail'

reads_ONTARGET = 0
@@ -100,7 +136,9 @@
bases_UNALIGNED += rlen
lengs_UNALIGNED[rlen] += 1

- if fail:
+ if keep:
+ yield align
+ elif fail:
align.is_qcfail = 1
yield align

@@ -112,7 +150,9 @@
bases_CONTROL += rlen
lengs_CONTROL[rlen] += 1

- if fail:
+ if keep:
+ yield align
+ elif fail:
align.is_qcfail = 1
yield align

@@ -125,7 +165,9 @@
bases_TOOSHORT += rlen
lengs_TOOSHORT[rlen] += 1

- if fail:
+ if keep:
+ yield align
+ elif fail:
align.is_qcfail = 1
yield align

@@ -154,6 +196,7 @@
minreadlen = options.minreadlen
nulltarget = sys.maxint,sys.maxint

+ keep = options.action=='keep'
fail = options.action=='fail'

reads_ONTARGET = 0
@@ -191,7 +234,9 @@
bases_UNALIGNED += rlen
lengs_UNALIGNED[rlen] += 1

- if fail:
+ if keep:
+ yield align
+ elif fail:
align.is_qcfail = True
yield align

@@ -205,7 +250,9 @@
bases_CONTROL += rlen
lengs_CONTROL[rlen] += 1

- if fail:
+ if keep:
+ yield align
+ elif fail:
align.is_qcfail = True
yield align

@@ -222,7 +269,9 @@
bases_TOOSHORT += rlen
lengs_TOOSHORT[rlen] += 1

- if fail:
+ if keep:
+ yield align
+ elif fail:
align.is_qcfail = True
yield align

@@ -236,7 +285,9 @@
bases_OFFTARGET += rlen
lengs_OFFTARGET[rlen] += 1

- if fail:
+ if keep:
+ yield align
+ elif fail:
align.is_qcfail = True
yield align

@@ -279,7 +330,9 @@
bases_OFFTARGET += rlen
lengs_OFFTARGET[rlen] += 1

- if fail:
+ if keep:
+ yield align
+ elif fail:
align.is_qcfail = True
yield align

@@ -306,6 +359,7 @@
minoverlap = options.minoverlap

fail = options.action=='fail'
+ keep = options.action=='keep'
reads = stats.reads
bases = stats.bases

@@ -328,7 +382,9 @@
reads[UNALIGNED] += 1
bases[UNALIGNED] += align.rlen

- if fail:
+ if keep:
+ yield align
+ elif fail:
align.is_qcfail = True
yield align

@@ -341,7 +397,9 @@
reads[TOOSHORT] += 1
bases[TOOSHORT] += rlen

- if fail:
+ if keep:
+ yield align
+ elif fail:
align.is_qcfail = True
yield align

@@ -358,7 +416,9 @@
reads[OFFTARGET] += 1
bases[OFFTARGET] += rlen

- if fail:
+ if keep:
+ yield align
+ elif fail:
align.is_qcfail = True
yield align
else:
@@ -371,7 +431,7 @@
def __init__(self):
self.reads = [0]*STATUS_COUNT
self.bases = [0]*STATUS_COUNT
- self.lengths = [ [0]*6000 for i in range(STATUS_COUNT) ]
+ self.lengths = [ [0]*10000 for i in range(STATUS_COUNT) ]


def sink_file(outbam,aligns):
@@ -435,6 +495,60 @@
return samfiles,header,references,lengths,aligns


+def load_contig_remap(filename):
+ remap = {}
+ data = table_reader(filename, columns=['SRC_CONTIG','DST_CONTIG','SRC_OFFSET','DST_LEN'],want_header=False)
+
+ for src,dst,src_offset,dst_len in data:
+ src_offset = int(src_offset or 0)
+ dst_len = int(dst_len)
+
+ if src!=dst or offset:
+ remap[src] = dst,src_offset,dst_len
+
+ return remap
+
+
+def remap_contigs(header, references, lengths, alignments, remap):
+ src_remapped = set(r for r in references if r in remap)
+ dst_remapped = [ remap[r] for r in sorted(src_remapped) ]
+
+ tidmap = {}
+ new_references = []
+ new_lengths = []
+ new_sq = []
+ new_header = header.copy()
+ new_header['SQ'] = new_sq
+
+ for i,src_ref,src_len in izip(count(),references,lengths):
+ j = len(new_references)
+
+ if src_ref not in remap:
+ new_references.append(src_ref)
+ new_lengths.append(src_len)
+ tidmap[i] = j,0
+ new_sq.append( {'SN':src_ref, 'LN':src_len} )
+ else:
+ dst_ref,src_offset,dst_len = remap[src_ref]
+
+ new_references.append(dst_ref)
+ new_lengths.append(dst_len)
+ new_sq.append( {'SN':dst_ref, 'LN':dst_len} )
+
+ tidmap[i] = j,src_offset
+
+ def _alignments():
+ for align in alignments:
+ if align.tid>=0:
+ new_tid,offset = tidmap[align.tid]
+ align.tid = new_tid
+ align.pos += offset
+
+ yield align
+
+ return new_header,new_references,new_lengths,_alignments()
+
+
def option_parser():
from glu.lib.glu_argparse import GLUArgumentParser

@@ -456,10 +570,17 @@
help='Minimum alignment overlap with any target (default=1)')

parser.add_argument('--action', metavar='X', default='fail',
- help='Action to perform on failing alignments (drop or fail, default=fail)')
+ help='Action to perform on failing alignments (keep, drop or fail, default=fail)')

parser.add_argument('--setreadgroup', type=str, metavar='RGNAME',
help='Set all reads to specified read group name')
+
+ parser.add_argument('--fixindels', action='store_true',
+ help='Fix insertions adjacent to deletions and set them to mismatches.')
+
+ parser.add_argument('--remapcontig', metavar='FILE',
+ help='Remap contigs from original to new contigs with an offset location')
+
parser.add_argument('-o', '--output', metavar='FILE',
help='Output BAM file')
parser.add_argument('-O', '--sumout', metavar='FILE', default='-',
@@ -476,7 +597,7 @@

options.action = options.action.lower()

- if options.action not in ('drop','fail'):
+ if options.action not in ('drop','fail','keep'):
raise ValueError('Invalid filter action selected')

samfiles = []
@@ -501,13 +622,17 @@
lengths = merger.lengths
aligns = iter(merger)

+ if options.remapcontig:
+ remap = load_contig_remap(options.remapcontig)
+ header,references,lengths,aligns = remap_contigs(header, references, lengths, aligns, remap)
+
controls = set()
if options.controls:
rmap = dict( (contig,tid) for tid,contig in enumerate(references) )
controls = set(rmap.get(c) for c in set(list_reader(options.controls)))
controls.discard(None)

- aligns = progress_loop(aligns, label='Loading BAM file(s): ', units='alignments')
+ #aligns = progress_loop(aligns, label='Loading BAM file(s): ', units='alignments')
aligns = filter_alignments(aligns, options.includealign, options.excludealign)

stats = FilterStats()
@@ -528,6 +653,9 @@
if options.setreadgroup:
aligns = set_readgroup(options.setreadgroup,header,aligns)

+ if options.fixindels:
+ aligns = fix_cigar_indels(aligns)
+
flags = 'wb' if options.output.endswith('.bam') else 'wh'

outbam = pysam.Samfile(options.output, flags, header=header,
@@ -554,7 +682,7 @@
total_reads = sum(reads)
total_bases = sum(bases)

- sumout = autofile(hyphen(options.sumout, sys.stdout),'w')
+ sumout = autofile(hyphen(options.sumout, sys.stderr),'w')

if complete:
sumout.write('Alignment summary:\n')

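To use the new --remapcontig option, a remap table matching the columns read by load_contig_remap is needed. The sketch below writes a minimal table; the contig names, offsets, lengths, and file name are invented, the table is assumed to be tab-delimited, and the trailing command line is only a guess at the usual GLU invocation form.

# Hedged sketch: build a contig remap table for --remapcontig.
# Columns follow load_contig_remap above: SRC_CONTIG, DST_CONTIG,
# SRC_OFFSET (added to alignment positions when remapping), DST_LEN
# (length of the destination contig).  All values are illustrative.
remap_rows = [
    ('chr17_ctg5_hap1', 'chr17', 43044294, 81195210),
    ('HPV16_ref',       'HPV16',        0,     7906),
]

with open('contig_remap.txt', 'w') as out:
    for src, dst, src_offset, dst_len in remap_rows:
        out.write('%s\t%s\t%d\t%d\n' % (src, dst, src_offset, dst_len))

# Assumed invocation (module path /glu/modules/seq/filter.py):
#   glu seq.filter input.bam --remapcontig contig_remap.txt -o remapped.bam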