[glu-genetics] 5 new revisions pushed by bioinformed@gmail.com on 2012-02-21 14:37 GMT


glu-ge...@googlecode.com

Feb 21, 2012, 9:38:39 AM
to glu...@googlegroups.com
5 new revisions:

Revision: e2f457c90144
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Tue Feb 21 06:26:21 2012
Log: Decode function_score into database names.
http://code.google.com/p/glu-genetics/source/detail?r=e2f457c90144

Revision: 4dac9b1d0ecf
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Tue Feb 21 06:30:06 2012
Log: Made gene queries return only canonical mappings by default.
http://code.google.com/p/glu-genetics/source/detail?r=4dac9b1d0ecf

Revision: f3db1ea6a13e
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Tue Feb 21 06:30:56 2012
Log: Return function info, rather than function score now that decoding is...
http://code.google.com/p/glu-genetics/source/detail?r=f3db1ea6a13e

Revision: ae88c5a0d567
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Tue Feb 21 06:31:32 2012
Log: Update detection of loss-of-stop variants now that we annotate complex...
http://code.google.com/p/glu-genetics/source/detail?r=ae88c5a0d567

Revision: 2afd63200a75
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Tue Feb 21 06:32:18 2012
Log: Add code to allow remapping contigs.
http://code.google.com/p/glu-genetics/source/detail?r=2afd63200a75

==============================================================================
Revision: e2f457c90144
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Tue Feb 21 06:26:21 2012
Log: Decode function_score into database names.
http://code.google.com/p/glu-genetics/source/detail?r=e2f457c90144

Modified:
/glu/lib/seqlib/cgfvariants.py

=======================================
--- /glu/lib/seqlib/cgfvariants.py Wed Feb 1 08:26:47 2012
+++ /glu/lib/seqlib/cgfvariants.py Tue Feb 21 06:26:21 2012
@@ -10,7 +10,7 @@
from glu.lib.recordtype import recordtype


-VarInfo = namedtuple('VarInfo', 'exact_vars inexact_vars common_score function_score')
+VarInfo = namedtuple('VarInfo', 'exact_vars inexact_vars common_score function_info')
VarRecord = recordtype('VarRecord', 'chromosome start stop allele common_score function_score source')
VariantKey = namedtuple('VariantKey', 'chromosome start stop source')

@@ -91,4 +91,11 @@
inexact_vars = sorted(set(inexact_vars))
common_score = min(common_score.values()) if common_score else 0

- return VarInfo(exact_vars,inexact_vars,common_score,function_score)
+ functions = []
+ if function_score:
+ if function_score&0x01: functions.append('cosmic')
+ if function_score&0x02: functions.append('lsdb')
+ if function_score&0x04: functions.append('other')
+ if function_score&0x08: functions.append('omim')
+
+ return VarInfo(exact_vars,inexact_vars,common_score,functions)

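For readers of the library, the new decoding can be pictured as a small standalone helper. The sketch below only restates the bit-to-name mapping added in the diff above; the names FUNCTION_BITS and decode_function_score are illustrative and not part of the commit.

# Hedged sketch: mirrors the bitmask decoding introduced above.
FUNCTION_BITS = [(0x01, 'cosmic'),
                 (0x02, 'lsdb'),
                 (0x04, 'other'),
                 (0x08, 'omim')]

def decode_function_score(function_score):
    '''Translate a packed function_score bitmask into database names.'''
    if not function_score:
        return []
    return [name for bit, name in FUNCTION_BITS if function_score & bit]

# Example: a variant flagged in both COSMIC and OMIM
# decode_function_score(0x01 | 0x08) -> ['cosmic', 'omim']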
==============================================================================
Revision: 4dac9b1d0ecf
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Tue Feb 21 06:30:06 2012
Log: Made gene queries return only canonical mappings by default.
http://code.google.com/p/glu-genetics/source/detail?r=4dac9b1d0ecf

Modified:
/glu/lib/genedb/queries.py

=======================================
--- /glu/lib/genedb/queries.py Wed Jan 4 10:09:24 2012
+++ /glu/lib/genedb/queries.py Tue Feb 21 06:30:06 2012
@@ -22,7 +22,7 @@
cyto_re = re.compile('(\d+|X|Y)(?:([p|q])(?:(\d+)(.\d+)?)?)?$')


-def query_genes_by_name(con, gene, canonical=None, mapped=None):
+def query_genes_by_name(con, gene, canonical=1, mapped=None):
sql = '''
SELECT a.Alias,g.symbol,g.chrom,MIN(g.txStart) as start,MAX(g.txEnd) as end,g.strand,"GENE"
FROM alias a, gene g
@@ -35,6 +35,7 @@

if canonical is not None:
conditions.append('g.canonical & %d' % canonical)
+ conditions.append("g.chrom NOT LIKE '%!_%' ESCAPE '!'")

if mapped:
conditions += [ 'g.txStart<>""', 'g.txEnd<>""' ]
@@ -50,14 +51,28 @@

cur = con.cursor()
cur.execute(sql, (gene,))
- return cur.fetchall()
-
-
-def query_gene_by_name(con,gene):
+ genes = cur.fetchall()
+
+ if len(genes)>1:
+ new_genes = [ g for g in genes if g[1]==gene ]
+ if new_genes:
+ genes = new_genes
+
+ if len(genes)>1:
+ new_genes = [ g for g in genes if g[1].upper()==gene.upper() ]
+ if new_genes:
+ genes = new_genes
+
+ return genes
+
+
+def query_gene_by_name(con,gene,canonical=None,mapped=None):
genes = query_genes_by_name(con,gene)
+
if not genes:
raise KeyError('Cannot find gene "%s"' % gene)
elif len(genes) > 1:
+
raise KeyError('Gene not unique "%s"' % gene)
return genes[0]


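A rough usage sketch of the revised query follows, assuming the genedb is an SQLite file (the path and gene symbol are invented for illustration). With the new default of canonical=1 plus the added chromosome condition, queries return only canonical transcripts mapped to primary chromosomes (contig names without an underscore), and ambiguous alias hits are narrowed to exact symbol matches.

import sqlite3
from glu.lib.genedb.queries import query_genes_by_name, query_gene_by_name

con = sqlite3.connect('genedb.db')                 # illustrative path

# New default: canonical transcripts on primary chromosomes only
genes = query_genes_by_name(con, 'TP53')

# Recover the old, unfiltered behavior by disabling the canonical filter
all_hits = query_genes_by_name(con, 'TP53', canonical=None)

# Unique lookup; raises KeyError when the symbol is missing or still ambiguous
gene = query_gene_by_name(con, 'TP53')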
==============================================================================
Revision: f3db1ea6a13e
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Tue Feb 21 06:30:56 2012
Log: Return function info, rather than function score now that decoding is performed.
http://code.google.com/p/glu-genetics/source/detail?r=f3db1ea6a13e

Modified:
/glu/modules/seq/annotate.py

=======================================
--- /glu/modules/seq/annotate.py Tue Jan 3 10:35:13 2012
+++ /glu/modules/seq/annotate.py Tue Feb 21 06:30:56 2012
@@ -154,12 +154,15 @@
cvinfo = cv.score_and_classify(v.chrom,v.start,v.end,[v.ref,v.var[0]])
if cvinfo.exact_vars:
v.names = sorted(set(v.replace('dbsnp:','rs') for v in cvinfo.exact_vars)|set(v.names))
- inexact = ','.join(sorted(set(cvinfo.inexact_vars))) if cvinfo.inexact_vars else ''

new_info.append('COMMON_SCORE=%.2f' % cvinfo.common_score)
- new_info.append('FUNCTION_SCORE=%d' % cvinfo.function_score)
-
- if inexact:
+
+ if cvinfo.function_info:
+ function_info = ','.join(cvinfo.function_info)
+ new_info.append('FUNCTION_INFO=%s' % function_info)
+
+ if cvinfo.inexact_vars:
+ inexact = ','.join(sorted(set(cvinfo.inexact_vars)))
new_info.append('INEXACT_VARIANTS=%s' % inexact)

if cvinfo.common_score>options.commonscore:
@@ -201,8 +204,11 @@
v.names.remove('tgp')
v.filter.append('1000G')

- if './.' in v.genos:
- filter.append('PartiallyCalled')
+ if any(g[0]=='./.' for g in v.genos):
+ v.filter.append('PartiallyCalled')
+
+ if not v.ref or v.var==['']:
+ v.filter.append('Indel')

# Remove any old fields that have been replaced by a new field
new_info_fields = set(f.split('=',1)[0] for f in new_info)
@@ -231,6 +237,7 @@

metadata = vcf.metadata

metadata['FILTER'].append('##FILTER=<ID=PartiallyCalled,Description="Variant is not called for one or more samples">')
+ metadata['FILTER'].append('##FILTER=<ID=Indel,Description="Variant is an insertion or deletion">')
metadata['FILTER'].append('##FILTER=<ID=Intergenic,Description="Variant not in or near a gene">')

metadata['FILTER'].append('##FILTER=<ID=NotPredictedFunctional,Description="Variant is not predicted to alter a protein">')

@@ -257,8 +264,8 @@

if cv:

metadata['INFO'].append('##INFO=<ID=COMMON_SCORE,Number=1,Type=Float,Description="Common score: maximum allele frequency in any population for rarest allele">')
- metadata['INFO'].append('##INFO=<ID=FUNCTION_SCORE,Number=1,Type=Int,Description="Function score: reported as functional variant in OMIM, dbSNP, or COSMIC">')
- metadata['INFO'].append('##INFO=<ID=INEXACT_VARIANTS,Number=.,Type=String,Description="Inexact variant matche">')
+ metadata['INFO'].append('##INFO=<ID=FUNCTION_INFO,Number=.,Type=String,Description="Annotated as function by OMIM, dbSNP, or COSMIC">')
+ metadata['INFO'].append('##INFO=<ID=INEXACT_VARIANTS,Number=.,Type=String,Description="Observed variant matches inexactly: it has different alleles or overlaps observed">')

if kaviar:

metadata['INFO'].append('##INFO=<ID=KAVIAR_MAF,Number=1,Type=Float,Description="Minor allele frequency accourding to Kaviar database">')

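The effect on the INFO column can be summarized with a short sketch. The build_info helper below is hypothetical and only restates the logic of the hunk above; it assumes cvinfo has the fields of the updated VarInfo record (exact_vars, inexact_vars, common_score, function_info).

def build_info(cvinfo):
    # Hypothetical helper restating the INFO assembly from the diff above
    new_info = ['COMMON_SCORE=%.2f' % cvinfo.common_score]

    # FUNCTION_INFO now carries database names instead of a numeric score
    if cvinfo.function_info:                       # e.g. ['cosmic', 'omim']
        new_info.append('FUNCTION_INFO=%s' % ','.join(cvinfo.function_info))

    if cvinfo.inexact_vars:
        inexact = ','.join(sorted(set(cvinfo.inexact_vars)))
        new_info.append('INEXACT_VARIANTS=%s' % inexact)

    return new_info

# e.g. build_info(VarInfo([], ['dbsnp:12345'], 0.03, ['omim']))
#      -> ['COMMON_SCORE=0.03', 'FUNCTION_INFO=omim', 'INEXACT_VARIANTS=dbsnp:12345']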
==============================================================================
Revision: ae88c5a0d567
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Tue Feb 21 06:31:32 2012
Log: Update detection of loss-of-stop variants now that we annotate complex variants.
http://code.google.com/p/glu-genetics/source/detail?r=ae88c5a0d567

Modified:
/glu/lib/seqlib/vannotator.py

=======================================
--- /glu/lib/seqlib/vannotator.py Tue Jan 3 10:35:13 2012
+++ /glu/lib/seqlib/vannotator.py Tue Feb 21 06:31:32 2012
@@ -39,6 +39,31 @@
GeneEvidence = recordtype('GeneEvidence', 'chrom cytoband ref_start ref_end intersect gene details '
                                          'func func_class func_type ref_nuc var_nuc ref_aa var_aa')

+
+def find_all(s, ss):
+ start = 0
+ n = len(ss)
+ while True:
+ start = s.find(ss, start)
+ if start == -1: return
+ yield start
+ start += n
+
+
+def common_suffix(s1,s2):
+ min_len = min(len(s1), len(s2) )
+ suffix = []
+
+ for c1,c2 in izip(reversed(s1),reversed(s2)):
+ if c1==c2:
+ suffix.append(c1)
+ else:
+ break
+
+ suffix.reverse()
+ return ''.join(suffix)
+
+
def decode_gene(gene):
# Decode CDS boundaries and exon start and end coordinates
cds_start = int(gene.cdsStart)
@@ -373,8 +398,9 @@
result += [False,'SYNONYMOUS','REFERENCE',ref_nuc,var_nuc,'','']
return result

- ref_frame = len(ref_nuc)%3
- var_frame = len(var_nuc)%3
+ ref_frame = len(ref_nuc)%3
+ var_frame = len(var_nuc)%3
+ frameshift = (len(ref_nuc)-len(var_nuc))%3

if 0:
print ' REF_FRAME: %d' % ref_frame
@@ -463,22 +489,23 @@

# Classify non-synonymous change by comparing AA sequences

- ref_stop = ref_aa.find('*')
- var_stop = var_aa.find('*')
-
- if ref_stop==-1:
- ref_stop = len(ref_aa)
- if var_stop==-1:
- var_stop = len(var_aa)
-
- if var_stop<ref_stop:
- mut_type.append('PREMATURE STOP')
- elif var_stop>ref_stop:
- mut_type.append('LOSS OF STOP')
-
- if len(ref_aa)==len(var_aa):
+ # Make sure ref protein doesn't appear to have spurious stops
+
+ r = ref_cds_aa.rstrip('*')
+ v = var_cds_aa.rstrip('*')
+
+ ref_stop = r.find('*')
+ var_stop = v.find('*')
+
+ if ref_stop==-1:
+ if var_stop!=-1 and not v.startswith(r):
+ mut_type.append('PREMATURE STOP')
+ elif ref_cds_aa[-1]=='*' and var_cds_aa[-1]!='*':
+ mut_type.append('LOSS OF STOP')
+
+ if len(r)==len(v):
mut_type.append('SUBSTITUTION')
- elif len(ref_aa)>len(var_aa):
+ elif len(r)>len(v):
mut_type.append('DELETION')
else:
mut_type.append('INSERTION')

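The new stop classification can be read in isolation as follows. Since the mailing-list copy drops indentation, this is one plausible reading of the hunk above, not the committed function itself; it assumes ref_cds_aa and var_cds_aa are non-empty translated CDS protein strings that end in '*' when a stop codon is present.

def classify_stop_change(ref_cds_aa, var_cds_aa):
    # Hypothetical restatement of the classification in the diff above.
    # Assumes both protein strings are non-empty.
    mut_type = []

    # Trim trailing stops so the comparison ignores the normal terminator
    r = ref_cds_aa.rstrip('*')
    v = var_cds_aa.rstrip('*')

    # Only classify stop gain/loss when the reference has no internal stop
    if r.find('*') == -1:
        if v.find('*') != -1 and not v.startswith(r):
            mut_type.append('PREMATURE STOP')
        elif ref_cds_aa[-1] == '*' and var_cds_aa[-1] != '*':
            mut_type.append('LOSS OF STOP')

    if len(r) == len(v):
        mut_type.append('SUBSTITUTION')
    elif len(r) > len(v):
        mut_type.append('DELETION')
    else:
        mut_type.append('INSERTION')

    return mut_type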
==============================================================================
Revision: 2afd63200a75
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Tue Feb 21 06:32:18 2012
Log: Add code to allow remapping contigs.
http://code.google.com/p/glu-genetics/source/detail?r=2afd63200a75

Modified:
/glu/modules/seq/filter.py

=======================================
--- /glu/modules/seq/filter.py Thu Dec 29 11:09:15 2011
+++ /glu/modules/seq/filter.py Tue Feb 21 06:32:18 2012
@@ -13,15 +13,16 @@

from collections import deque
from operator import attrgetter, itemgetter
-from itertools import groupby, imap, izip
+from itertools import groupby, imap, izip, count

import pysam

from glu.lib.utils import consume
-from glu.lib.fileutils import autofile, hyphen, list_reader, table_writer
+from glu.lib.fileutils import autofile, hyphen, list_reader, table_writer, table_reader
from glu.lib.progressbar import progress_loop

from glu.lib.seqlib.bed import read_features
+from glu.lib.seqlib.cigar import fix_cigar_bam, cigar_bam_str
from glu.lib.seqlib.filter import alignment_filter_options, filter_alignments


@@ -43,23 +44,57 @@
rg = ('RG',groupname)

for align in aligns:
- tags = align.tags or []
-
- try:
- names,values = zip(*tags)
- idx = names.index('RG')
- tags[idx] = rg
-
- except ValueError:
- tags.append(rg)
-
- align.tags = tags
+ tags = align.tags
+
+ if tags:
+ align.tags = tags = [ (k,v) for k,v in tags if k[0]!='X' ]
+
+ if tags is None:
+ align.tags = [rg]
+ elif not tags:
+ align.tags.append(rg)
+ else:
+ for i,(name,value) in enumerate(tags):
+ if name=='RG':
+ tags[i] = rg
+ break
+ else:
+ tags.append(rg)
+
+ align.tags = tags
+
+ #tags = align.tags or []
+ #
+ #try:
+ # names,values = zip(*tags)
+ # idx = names.index('RG')
+ # tags[idx] = rg
+ #
+ #except ValueError:
+ # tags.append(rg)
+ #
+ #align.tags = tags

yield align

return _set_readgroup()


+from glu.lib.seqlib.edits import reduce_match
+
+def fix_cigar_indels(aligns):
+ for align in aligns:
+ if align.tid>=0:
+ #before = align.cigar
+ align.cigar = fix_cigar_bam(align.cigar)
+ #if align.cigar is not before:
+ # print 'BEFORE:',cigar_bam_str(before)
+ # print 'AFTER: ',cigar_bam_str(align.cigar)
+ # print
+
+ yield align
+
+
def contig_stats(aligns,references,filename):
stats = [0]*len(references)
for align in aligns:
@@ -75,6 +110,7 @@
def simple_filter(aligns,controls,stats,options):
minreadlen = options.minreadlen

+ keep = options.action=='keep'
fail = options.action=='fail'

reads_ONTARGET = 0
@@ -100,7 +136,9 @@
bases_UNALIGNED += rlen
lengs_UNALIGNED[rlen] += 1

- if fail:
+ if keep:
+ yield align
+ elif fail:
align.is_qcfail = 1
yield align

@@ -112,7 +150,9 @@
bases_CONTROL += rlen
lengs_CONTROL[rlen] += 1

- if fail:
+ if keep:
+ yield align
+ elif fail:
align.is_qcfail = 1
yield align

@@ -125,7 +165,9 @@
bases_TOOSHORT += rlen
lengs_TOOSHORT[rlen] += 1

- if fail:
+ if keep:
+ yield align
+ elif fail:
align.is_qcfail = 1
yield align

@@ -154,6 +196,7 @@
minreadlen = options.minreadlen
nulltarget = sys.maxint,sys.maxint

+ keep = options.action=='keep'
fail = options.action=='fail'

reads_ONTARGET = 0
@@ -191,7 +234,9 @@
bases_UNALIGNED += rlen
lengs_UNALIGNED[rlen] += 1

- if fail:
+ if keep:
+ yield align
+ elif fail:
align.is_qcfail = True
yield align

@@ -205,7 +250,9 @@
bases_CONTROL += rlen
lengs_CONTROL[rlen] += 1

- if fail:
+ if keep:
+ yield align
+ elif fail:
align.is_qcfail = True
yield align

@@ -222,7 +269,9 @@
bases_TOOSHORT += rlen
lengs_TOOSHORT[rlen] += 1

- if fail:
+ if keep:
+ yield align
+ elif fail:
align.is_qcfail = True
yield align

@@ -236,7 +285,9 @@
bases_OFFTARGET += rlen
lengs_OFFTARGET[rlen] += 1

- if fail:
+ if keep:
+ yield align
+ elif fail:
align.is_qcfail = True
yield align

@@ -279,7 +330,9 @@
bases_OFFTARGET += rlen
lengs_OFFTARGET[rlen] += 1

- if fail:
+ if keep:
+ yield align
+ elif fail:
align.is_qcfail = True
yield align

@@ -306,6 +359,7 @@
minoverlap = options.minoverlap

fail = options.action=='fail'
+ keep = options.action=='keep'
reads = stats.reads
bases = stats.bases

@@ -328,7 +382,9 @@
reads[UNALIGNED] += 1
bases[UNALIGNED] += align.rlen

- if fail:
+ if keep:
+ yield align
+ elif fail:
align.is_qcfail = True
yield align

@@ -341,7 +397,9 @@
reads[TOOSHORT] += 1
bases[TOOSHORT] += rlen

- if fail:
+ if keep:
+ yield align
+ elif fail:
align.is_qcfail = True
yield align

@@ -358,7 +416,9 @@
reads[OFFTARGET] += 1
bases[OFFTARGET] += rlen

- if fail:
+ if keep:
+ yield align
+ elif fail:
align.is_qcfail = True
yield align
else:
@@ -371,7 +431,7 @@
def __init__(self):
self.reads = [0]*STATUS_COUNT
self.bases = [0]*STATUS_COUNT
- self.lengths = [ [0]*6000 for i in range(STATUS_COUNT) ]
+ self.lengths = [ [0]*10000 for i in range(STATUS_COUNT) ]


def sink_file(outbam,aligns):
@@ -435,6 +495,60 @@
return samfiles,header,references,lengths,aligns


+def load_contig_remap(filename):
+ remap = {}
+ data = table_reader(filename, columns=['SRC_CONTIG','DST_CONTIG','SRC_OFFSET','DST_LEN'],want_header=False)
+
+ for src,dst,src_offset,dst_len in data:
+ src_offset = int(src_offset or 0)
+ dst_len = int(dst_len)
+
+ if src!=dst or offset:
+ remap[src] = dst,src_offset,dst_len
+
+ return remap
+
+
+def remap_contigs(header, references, lengths, alignments, remap):
+ src_remapped = set(r for r in references if r in remap)
+ dst_remapped = [ remap[r] for r in sorted(src_remapped) ]
+
+ tidmap = {}
+ new_references = []
+ new_lengths = []
+ new_sq = []
+ new_header = header.copy()
+ new_header['SQ'] = new_sq
+
+ for i,src_ref,src_len in izip(count(),references,lengths):
+ j = len(new_references)
+
+ if src_ref not in remap:
+ new_references.append(src_ref)
+ new_lengths.append(src_len)
+ tidmap[i] = j,0
+ new_sq.append( {'SN':src_ref, 'LN':src_len} )
+ else:
+ dst_ref,src_offset,dst_len = remap[src_ref]
+
+ new_references.append(dst_ref)
+ new_lengths.append(dst_len)
+ new_sq.append( {'SN':dst_ref, 'LN':dst_len} )
+
+ tidmap[i] = j,src_offset
+
+ def _alignments():
+ for align in alignments:
+ if align.tid>=0:
+ new_tid,offset = tidmap[align.tid]
+ align.tid = new_tid
+ align.pos += offset
+
+ yield align
+
+ return new_header,new_references,new_lengths,_alignments()
+
+
def option_parser():
from glu.lib.glu_argparse import GLUArgumentParser

@@ -456,10 +570,17 @@
help='Minimum alignment overlap with any target (default=1)')

parser.add_argument('--action', metavar='X', default='fail',
- help='Action to perform on failing alignments (drop or fail, default=fail)')
+ help='Action to perform on failing alignments (keep, drop or fail, default=fail)')

parser.add_argument('--setreadgroup', type=str, metavar='RGNAME',
help='Set all reads to specified read group name')
+
+ parser.add_argument('--fixindels', action='store_true',
+ help='Fix insertions adjacent to deletions and set them to mismatches.')
+
+ parser.add_argument('--remapcontig', metavar='FILE',
+ help='Remap contigs from original to new contigs with an offset location')
+
parser.add_argument('-o', '--output', metavar='FILE',
help='Output BAM file')
parser.add_argument('-O', '--sumout', metavar='FILE', default='-',
@@ -476,7 +597,7 @@

options.action = options.action.lower()

- if options.action not in ('drop','fail'):
+ if options.action not in ('drop','fail','keep'):
raise ValueError('Invalid filter action selected')

samfiles = []
@@ -501,13 +622,17 @@
lengths = merger.lengths
aligns = iter(merger)

+ if options.remapcontig:
+ remap = load_contig_remap(options.remapcontig)
+ header,references,lengths,aligns = remap_contigs(header, references, lengths, aligns, remap)
+
controls = set()
if options.controls:
rmap = dict( (contig,tid) for tid,contig in enumerate(references) )
controls = set(rmap.get(c) for c in set(list_reader(options.controls)))
controls.discard(None)

- aligns = progress_loop(aligns, label='Loading BAM file(s): ', units='alignments')
+ #aligns = progress_loop(aligns, label='Loading BAM file(s): ', units='alignments')
aligns = filter_alignments(aligns, options.includealign, options.excludealign)

stats = FilterStats()
@@ -528,6 +653,9 @@
if options.setreadgroup:
aligns = set_readgroup(options.setreadgroup,header,aligns)

+ if options.fixindels:
+ aligns = fix_cigar_indels(aligns)
+
flags = 'wb' if options.output.endswith('.bam') else 'wh'

outbam = pysam.Samfile(options.output, flags, header=header,
@@ -554,7 +682,7 @@
total_reads = sum(reads)
total_bases = sum(bases)

- sumout = autofile(hyphen(options.sumout, sys.stdout),'w')
+ sumout = autofile(hyphen(options.sumout, sys.stderr),'w')

if complete:
sumout.write('Alignment summary:\n')

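To use the new --remapcontig option, a remap table matching the columns read by load_contig_remap is needed. The sketch below writes a minimal table; the contig names, offsets, lengths, and file name are invented, the table is assumed to be tab-delimited, and the trailing command line is only a guess at the usual GLU invocation form.

# Hedged sketch: build a contig remap table for --remapcontig.
# Columns follow load_contig_remap above: SRC_CONTIG, DST_CONTIG,
# SRC_OFFSET (added to alignment positions when remapping), DST_LEN
# (length of the destination contig).  All values are illustrative.
remap_rows = [
    ('chr17_ctg5_hap1', 'chr17', 43044294, 81195210),
    ('HPV16_ref',       'HPV16',        0,     7906),
]

with open('contig_remap.txt', 'w') as out:
    for src, dst, src_offset, dst_len in remap_rows:
        out.write('%s\t%s\t%d\t%d\n' % (src, dst, src_offset, dst_len))

# Assumed invocation (module path /glu/modules/seq/filter.py):
#   glu seq.filter input.bam --remapcontig contig_remap.txt -o remapped.bam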