[glu-genetics] 12 new revisions pushed by bioinformed@gmail.com on 2012-07-20 16:57 GMT

1 view

Skip to first unread message

glu-ge...@googlecode.com

unread,

Jul 20, 2012, 12:58:17 PM7/20/12

to glu...@googlegroups.com

12 new revisions:

Revision: dec0448b9eab
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Fri Jun 22 10:19:41 2012
Log: Add padding option to on-target filters
http://code.google.com/p/glu-genetics/source/detail?r=dec0448b9eab

Revision: df2cae6051ce
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Fri Jul 20 09:47:00 2012
Log: Remove spaces from annotations, since some VCF parsers are
painfully l...
http://code.google.com/p/glu-genetics/source/detail?r=df2cae6051ce

Revision: da5128386068
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Fri Jul 20 09:47:41 2012
Log: Perform slightly more intelligent chromosome name normalization
http://code.google.com/p/glu-genetics/source/detail?r=da5128386068

Revision: 445887b7ae8e
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Fri Jul 20 09:48:31 2012
Log: Minor fixes for multi-allelic variants and added some optional
filters...
http://code.google.com/p/glu-genetics/source/detail?r=445887b7ae8e

Revision: c3b28309d596
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Fri Jul 20 09:48:49 2012
Log: Remove unused import
http://code.google.com/p/glu-genetics/source/detail?r=c3b28309d596

Revision: 158c4d06e062
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Fri Jul 20 09:49:16 2012
Log: Remove unused import
http://code.google.com/p/glu-genetics/source/detail?r=158c4d06e062

Revision: 568a4641e874
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Fri Jul 20 09:50:13 2012
Log: Add more models and use VCFWriter infrastructure
http://code.google.com/p/glu-genetics/source/detail?r=568a4641e874

Revision: d8260c256066
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Fri Jul 20 09:51:09 2012
Log: Better handle compressed input files
http://code.google.com/p/glu-genetics/source/detail?r=d8260c256066

Revision: 69683a256021
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Fri Jul 20 09:52:06 2012
Log: Add support for IonTorrent variant output files
http://code.google.com/p/glu-genetics/source/detail?r=69683a256021

Revision: 7a5a01e793b2
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Fri Jul 20 09:52:28 2012
Log: Remove unused import
http://code.google.com/p/glu-genetics/source/detail?r=7a5a01e793b2

Revision: 53a419da4d43
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Fri Jul 20 09:54:30 2012
Log: Cleanups and fixes to allow better filtering of samples and use
of...
http://code.google.com/p/glu-genetics/source/detail?r=53a419da4d43

Revision: 87c5a1f4e1f0
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Fri Jul 20 09:55:49 2012
Log: Improve VCF writer to accommodate parsers that are strict about
disall...
http://code.google.com/p/glu-genetics/source/detail?r=87c5a1f4e1f0

==============================================================================
Revision: dec0448b9eab
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Fri Jun 22 10:19:41 2012
Log: Add padding option to on-target filters
http://code.google.com/p/glu-genetics/source/detail?r=dec0448b9eab

Modified:
/glu/lib/seqlib/bed.py
/glu/modules/seq/filter.py

=======================================
--- /glu/lib/seqlib/bed.py Sun Nov 6 04:35:39 2011
+++ /glu/lib/seqlib/bed.py Fri Jun 22 10:19:41 2012
@@ -41,7 +41,7 @@

-def read_features(filename,merge=True):
+def read_features(filename,merge=True,pad=0):
features = defaultdict(list)

if not filename:
@@ -63,7 +63,17 @@
else:
name = next(generic_names)

- features[contig].append( (int(start),int(end),name) )
+ start = int(start)
+ end = int(end)
+
+ if start>end:
+ start,end = end,start
+
+ if pad:
+ start -= pad
+ end -= pad
+
+ features[contig].append( (start,end,name) )

for contig in features:
if merge:
=======================================
--- /glu/modules/seq/filter.py Sun Apr 8 12:46:26 2012
+++ /glu/modules/seq/filter.py Fri Jun 22 10:19:41 2012
@@ -530,6 +530,8 @@
help='Minimum read length filter')
parser.add_argument('--targets', metavar='BED',
help='Single track BED file containing all targeted
intervals')
+ parser.add_argument('--padtargets', metavar='N', default=0,
+ help='Pad each target with N base pairs up- and
down-stream (default=0)')
parser.add_argument('--controls', metavar='CONTIGS',
help='List of control contigs to allow those aligned '
'reads to be counted seperately from other
aligned reads '
@@ -613,7 +615,7 @@
aligns = contig_stats(aligns,references,options.contigstats)

if options.targets:
- targets = read_features(options.targets)
+ targets =
read_features(options.targets,merge=True,pad=options.padtargets)
aligns =
target_filter(aligns,references,targets,controls,stats,options)
else:
aligns = simple_filter(aligns,controls,stats,options)

==============================================================================
Revision: df2cae6051ce
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Fri Jul 20 09:47:00 2012
Log: Remove spaces from annotations, since some VCF parsers are
painfully literal
about the "spec".
http://code.google.com/p/glu-genetics/source/detail?r=df2cae6051ce

Modified:
/glu/lib/seqlib/vannotator.py

=======================================
--- /glu/lib/seqlib/vannotator.py Sun Apr 22 13:24:21 2012
+++ /glu/lib/seqlib/vannotator.py Fri Jul 20 09:47:00 2012
@@ -260,10 +260,10 @@
three_prime.add(gene)

for gene in five_prime:
- evidence.append( ["5' of
gene",gene,'',False,'','',ref_nuc,var_nuc,'',''] )
+ evidence.append(
['UPSTREAM_GENE',gene,'',False,'','',ref_nuc,var_nuc,'',''] )

for gene in three_prime:
- evidence.append( ["3' of
gene",gene,'',False,'','',ref_nuc,var_nuc,'',''] )
+ evidence.append(
['DOWNSTREAM_GENE',gene,'',False,'','',ref_nuc,var_nuc,'',''] )

if not evidence:
evidence.append(
['intergenic','','',False,'','',ref_nuc,var_nuc,'',''] )
@@ -300,7 +300,7 @@
for splice in gene_parts.find_values(ref_start-5,ref_end+5):
if splice.type=='CDS' or 'UTR' in splice.type:
if (0<splice.start-ref_end<=5) or (0<ref_start-splice.end<=5):
- mut_type.add('POSSIBLE INTRONIC SPLICE VARIANT')
+ mut_type.add('POSSIBLE_INTRONIC_SPLICE_VARIANT')

parts = ','.join(sorted(parts))
mut_type = ','.join(sorted(mut_type))
@@ -352,9 +352,9 @@
mut_type = []

if exon_start<5:
- mut_type.append("POSSIBLE 5' EXONIC SPLICE VARIANT")
+ mut_type.append('POSSIBLE-SPLICE5')
if cds.end-exon_end<5:
- mut_type.append("POSSIBLE 3' EXONIC SPLICE VARIANT")
+ mut_type.append('POSSIBLE-SPLICE3')

if ref_frame!=var_frame:
mut_type.append('FRAMESHIFT')
@@ -405,9 +405,9 @@
ref_cds_aa = ref_cds.translate()
var_cds_aa = var_cds.translate()
except TranslationError:
- mut_type.append('INVALID TRANSLATION')
+ mut_type.append('INVALID_TRANSLATION')
mut_type = ','.join(sorted(mut_type))
- result += [True,'PRESUMED
NON-SYNONYMOUS',mut_type,ref_cds_nuc,var_cds_nuc,'','']
+ result +=
[True,'PRESUMED_NON-SYNONYMOUS',mut_type,ref_cds_nuc,var_cds_nuc,'','']
return result

ref_aa,var_aa,aa_position =
reduce_match(str(ref_cds_aa),str(var_cds_aa))
@@ -442,9 +442,9 @@

if ref_stop==-1:
if var_stop!=-1 and not v.startswith(r):
- mut_type.append('PREMATURE STOP')
+ mut_type.append('PREMATURE_STOP')
elif ref_cds_aa[-1]=='*' and var_cds_aa[-1]!='*':
- mut_type.append('LOSS OF STOP')
+ mut_type.append('LOSS_OF_STOP')

if len(r)==len(v):
mut_type.append('SUBSTITUTION')

==============================================================================
Revision: da5128386068
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Fri Jul 20 09:47:41 2012
Log: Perform slightly more intelligent chromosome name normalization
http://code.google.com/p/glu-genetics/source/detail?r=da5128386068

Modified:
/glu/lib/illumina.py

=======================================
--- /glu/lib/illumina.py Mon Aug 1 09:42:08 2011
+++ /glu/lib/illumina.py Fri Jul 20 09:47:41 2012
@@ -286,6 +286,9 @@

norm_ids[snpnum-1] = norm_id = norm_id + 100*assay_type_id + 1

+ if chromosome=='0':
+ chromosome = ''
+
yield
ManifestRow(ilmnid,name,design_strand,alleles,assay_type_id,norm_id,

addressA_id,alleleA_probe_sequence,addressB_id,alleleB_probe_sequence,
genome_version,chromosome,mapinfo,ploidy,species,
@@ -421,7 +424,7 @@
gstrand = assay[assayid_idx].split('_')[-2]

if targetstrand in ('real_forward','real_reverse'):
- gstrand = assay[gstrand_idx] or 'U'
+ gstrand = gstrandmap.get(assay[gstrand_idx]) or 'U'

if chr.startswith('chr'):
chr = chr[3:]
@@ -560,23 +563,23 @@
manifest = iter(manifest)
header = manifest.next()
name_idx = find_index(header,['Name'])
- chr_idx = find_index(header,['Chr'])
+ chrom_idx = find_index(header,['Chr'])
loc_idx = find_index(header,['MapInfo'])

for assay in manifest:
lname = assay[name_idx]
- chr = assay[chr_idx] or None
+ chrom = assay[chrom_idx] or None
loc = assay[loc_idx]

- if chr.startswith('chr'):
- chr = chr[3:]
- if chr=='Mt':
- chr='M'
+ if chrom and chrom.startswith('chr'):
+ chrom = chrom[3:]
+ if chrom=='Mt':
+ chrom='M'

if loc:
loc = int(loc)

- yield lname,chr,loc
+ yield lname,chrom,loc

def read_Illumina_IDAT(filename):

==============================================================================
Revision: 445887b7ae8e
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Fri Jul 20 09:48:31 2012
Log: Minor fixes for multi-allelic variants and added some optional
filters that
are currently commented out.
http://code.google.com/p/glu-genetics/source/detail?r=445887b7ae8e

Modified:
/glu/modules/seq/vcf_summary.py

=======================================
--- /glu/modules/seq/vcf_summary.py Mon Mar 12 17:55:54 2012
+++ /glu/modules/seq/vcf_summary.py Fri Jul 20 09:48:31 2012
@@ -52,9 +52,21 @@
ex_known = np.zeros(n, dtype=int)
ge_known = np.zeros(n, dtype=int)

- variant_genos = set(['1/0','0/1','1/1'])
+ # Bloody double negatives
+ not_variant = set(['.','0','0/0','./0','0/.','./.'])
+
+ bad = set(['SegDup','Repeat'])

for v in vcf:
+ #if bad.intersection(v.filter):
+ # continue
+
+ #if 'SegDup' in v.filter or 'Repeat' in v.filter:
+ # continue
+
+ #if 'SegDup' not in v.filter and 'Repeat' not in v.filter:
+ # continue
+
#if float(v.qual)<30:
# continue

@@ -66,7 +78,7 @@
#if missing.sum()>n/2:
# continue

- variant = np.fromiter( (g[0] in variant_genos for g in v.genos),
dtype=bool, count=n )
+ variant = np.fromiter( (g[0] not in not_variant for g in v.genos),
dtype=bool, count=n )

if not variant.sum():
continue
@@ -80,7 +92,7 @@

infomap[key] = value

- if not v.names and int(infomap.get('REFVAR_OUTGROUP_COUNT',0))>50:
+ if not v.names and int(infomap.get('REFVAR_OUTGROUP_COUNT',0))>80:
continue

if v.names:

==============================================================================
Revision: c3b28309d596
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Fri Jul 20 09:48:49 2012
Log: Remove unused import
http://code.google.com/p/glu-genetics/source/detail?r=c3b28309d596

Modified:
/glu/modules/seq/vcf2table.py

=======================================
--- /glu/modules/seq/vcf2table.py Sun Apr 8 12:47:09 2012
+++ /glu/modules/seq/vcf2table.py Fri Jul 20 09:48:49 2012
@@ -10,8 +10,6 @@

import sys

-from operator import itemgetter
-
from glu.lib.utils import unique
from glu.lib.fileutils import table_writer, table_reader,
list_reader, autofile, hyphen
from glu.lib.progressbar import progress_loop

==============================================================================
Revision: 158c4d06e062
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Fri Jul 20 09:49:16 2012
Log: Remove unused import
http://code.google.com/p/glu-genetics/source/detail?r=158c4d06e062

Modified:
/glu/modules/seq/query_var.py

=======================================
--- /glu/modules/seq/query_var.py Thu Dec 22 08:28:43 2011
+++ /glu/modules/seq/query_var.py Fri Jul 20 09:49:16 2012
@@ -10,8 +10,6 @@

import sys

-from operator import itemgetter
-
from glu.lib.utils import unique
from glu.lib.fileutils import table_writer, table_reader,
autofile, hyphen
from glu.lib.progressbar import progress_loop

==============================================================================
Revision: 568a4641e874
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Fri Jul 20 09:50:13 2012
Log: Add more models and use VCFWriter infrastructure
http://code.google.com/p/glu-genetics/source/detail?r=568a4641e874

Modified:
/glu/modules/seq/filtervcf.py

=======================================
--- /glu/modules/seq/filtervcf.py Mon Mar 12 17:56:32 2012
+++ /glu/modules/seq/filtervcf.py Fri Jul 20 09:50:13 2012
@@ -12,82 +12,122 @@
import time
import sys

-from collections import defaultdict
-
-from glu.lib.fileutils import table_writer, table_reader,
autofile, hyphen, map_reader
+from glu.lib.fileutils import table_reader
from glu.lib.progressbar import progress_loop

-from glu.lib.seqlib.edits import reduce_match
-from glu.lib.seqlib.intervaltree import IntervalTree
+from glu.lib.seqlib.vcf import VCFReader, VCFWriter

def build_segmodel(model,samples,phenofile=None):
- phenomap = None
+ constraintmap = {}
+
if phenofile:
phenos = table_reader(options.phenos)
- phenomap = {}
for row in phenos:
if len(row)>1 and row[0] and row[1]:
sample = row[0]
constraint = row[1]
cost = int(row[2] or 1) if len(row)>2 else 1
- phenomap[sample] = constraint,cost
-
- segmodel = []
+ score = int(row[3] or 0) if len(row)>3 else 0
+ constraintmap[sample] = constraint,cost,score
+
+ segmodel = []
+ maxcost = 1
+ minscore = 0

if model:
- if phenomap is None:
- phenomap = {}
- for sample in samples:
- code = sample[-1]
- if code in 'ACU':
- phenomap[sample] = code
-
- if model=='rec':
- cost = 1
+ phenomap = {}
+ for sample in samples:
+ code = sample[-1]
+ if code in 'ACU':
+ phenomap[sample] = code
+
+ if model=='aff2':
+ cost = 0
+ score = 0.99
+ minscore = 1
+ for sample,code in phenomap.items():
+ if code in 'AC':
+ constraintmap[sample] = 'VAR',cost,score
+
+ elif model=='rec':
+ cost = 1
+ score = 0
for sample,code in phenomap.items():
if code=='A':
- phenomap[sample] = 'HOM',cost
+ constraintmap[sample] = 'HOM',cost,score
elif code=='C':
- phenomap[sample] = 'VAR',cost
+ constraintmap[sample] = 'VAR',cost,score
elif code=='U':
- phenomap[sample] = 'REF',cost
+ constraintmap[sample] = 'REF',cost,score

else:
- cost = 1 if model!='domm1' else 0.999999
+ cost = 1 if model!='domm1' else 0.999999
+ score = 0

for sample,code in phenomap.items():
if code in 'AC':
- phenomap[sample] = 'VAR',cost
+ constraintmap[sample] = 'VAR',cost,score
elif code=='U':
- phenomap[sample] = 'REF',1
-
- segmodel = [ (i,)+phenomap[s] for i,s in enumerate(samples) if s in
phenomap ]
-
- return segmodel
+ constraintmap[sample] = 'REF',1,score
+
+ segmodel = [ (i,)+constraintmap[s] for i,s in enumerate(samples) if s
in constraintmap ]
+
+ return maxcost,minscore,segmodel

-def check_segmodel(segmodel,genos,maxfail=1):
- fail = 0
- for i,constraint,cost in segmodel:
+def check_segmodel(segmodel,genos):
+ maxcost,minscore,constraints = segmodel
+ cost = score = 0
+
+ for i,constraint,c_cost,c_score in constraints:
g = genos[i]
- if constraint=='VAR' and g not in ('0/1','1/0','1/1'):
- fail += cost
- elif constraint=='HOM' and g!='1/1':
- fail += cost
- elif constraint=='HET' and g not in ('0/1','1/0'):
- fail += cost
- elif constraint=='REF' and g!='0/0':
- fail += cost
- elif constraint=='NOTREF' and g=='0/0':
- fail += cost
- elif constraint=='MISS' and g!='./.':
- fail += cost
-
- if fail>=maxfail:
+
+ if g=='.':
+ a=b='.'
+ else:
+ a,b = g.split('/')
+
+ if constraint=='VAR':
+ match = a not in '0.' or b not in '0.'
+ elif constraint=='HOM':
+ match = a==b and a not in '0.'
+ elif constraint=='HET':
+ match = a!=b and a not in '0.' and b not in '0.'
+ elif constraint=='REF':
+ match = a==b=='0'
+ elif constraint=='NOTREF':
+ match = a not in '.0' and b not in '.0'
+ elif constraint=='MISS':
+ match = a==b=='.'
+ elif constraint=='NOTMISS':
+ match = a!='.' and b!='.'
+ else:
+ raise ValueError('Unknown constraint: %s' % constraint)
+
+ if match:
+ score += c_score
+ else:
+ cost += c_cost
+
+ if cost>=maxcost:
return False

- return True
+ return score>=minscore
+
+
+def build_infomap(v):
+ infomap = {}
+ for inf in v.info:
+ if '=' in inf:
+ key,value = inf.split('=',1)
+ else:
+ key,value = inf,''
+
+ infomap[key] = value
+
+ return infomap
+

def option_parser():
@@ -111,6 +151,9 @@
help='Mapping from individual ID to phenotype state')
parser.add_argument('--model', metavar='NAME', default='',
help='Segregation model (dom,rec)')
+ parser.add_argument('--maxrefvar', metavar='RATE', type=int, default=0,
+ help='Maximum number of variant outgroup samples
(unlimited=0, default)')
+
parser.add_argument('-o', '--output', metavar='FILE', default='-',
help='Output variant file')
return parser
@@ -121,9 +164,21 @@
options = parser.parse_args()

filename = options.variants
- variants = table_reader(filename,hyphen=sys.stdin)
- out = autofile(hyphen(options.output,sys.stdout),'w')
+ vcf =
VCFReader(options.variants,hyphen=sys.stdin,normalize_indels=False)
+ out = VCFWriter(options.output, vcf.metadata, vcf.samples)
model = options.model.lower() if not options.phenos else 'custom'
+ segmodel = build_segmodel(model,vcf.samples,options.phenos)
+
+ if segmodel[2]:
+ print
+ print '-'*80
+ print 'MAX COST =',segmodel[0]
+ print 'MIN SCORE =',segmodel[1]
+ print 'GENETIC MODEL CONSTRAINTS:'
+ for i,code,cost,score in segmodel[2]:
+ print ' %s: %s (cost=%f, score=%f)' %
(vcf.samples[i],code,cost,score)
+ print '-'*80
+ print

includefilter = set()
for opts in options.includefilter or []:
@@ -133,74 +188,31 @@
for opts in options.excludefilter or []:
excludefilter |= set(o.strip().upper() for o in opts.split(','))

- for row in variants:
- if not row or row[0].startswith('##'):
- out.write('\t'.join(row))
- out.write('\n')
+ for v in vcf:
+ if options.includeid and not v.names:
continue

- elif row[0].startswith('#'):
- out.write('\t'.join(row))
- out.write('\n')
- header = list(row)
- header[0] = header[0][1:]
-
- samples = [ s.split('.')[0] for s in header[9:] ]
- segmodel = build_segmodel(model,samples,options.phenos)
-
- if segmodel:
- print
- print '-'*80
- print 'GENETIC MODEL CONSTRAINTS:'
- for i,code,cost in segmodel:
- print ' %s: %s (cost=%f)' % (samples[i],code,cost)
- print '-'*80
- print
-
+ if options.excludeid and v.names:
continue

- chrom = row[0]
- end = int(row[1])
- start = end-1
- names = row[2]
-
- if names=='.':
- names = []
- else:
- names = names.split(',')
-
- if options.includeid and not names:
- continue
-
- if options.excludeid and names:
- continue
-
- ref = row[3]
- var = row[4].split(',')[0]
-
- filter = row[6]
-
- if filter=='.':
- filter = []
- else:
- filter = filter.split(';')
-
if includefilter or excludefilter:
- normfilter = set(f.upper() for f in filter)
+ normfilter = set(f.upper() for f in v.filter)
if includefilter and
len(normfilter&includefilter)!=len(includefilter):
continue

if excludefilter and (normfilter&excludefilter):
continue

- info = row[7].split(';')
- genos = [ g.split(':')[0] for g in row[9:] ]
-
+ if options.maxrefvar>0:
+ info = build_infomap(v)
+ if int(info.get('REFVAR_OUTGROUP_COUNT',0))>options.maxrefvar:
+ continue
+
+ genos = [ g[0] for g in v.genos ]
if not check_segmodel(segmodel,genos):
continue

- out.write('\t'.join(row))
- out.write('\n')
+ out.write_locus(v)

if __name__=='__main__':

==============================================================================
Revision: d8260c256066
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Fri Jul 20 09:51:09 2012
Log: Better handle compressed input files
http://code.google.com/p/glu-genetics/source/detail?r=d8260c256066

Modified:
/glu/modules/seq/convert.py

=======================================
--- /glu/modules/seq/convert.py Sat Jul 30 20:35:40 2011
+++ /glu/modules/seq/convert.py Fri Jul 20 09:51:09 2012
@@ -76,14 +76,15 @@

def sequence_writer(filename, outformat=None):
outformat = outformat or guess_outformat(filename) or 'fasta'
+
+ if outformat.endswith('.gz'):
+ outformat = outformat[:-3]
+
outformat = FORMAT_REMAP.get(outformat,outformat)
filename = parse_augmented_filename(filename, {})
outfile = autofile(hyphen(filename,sys.stdout),'wb')

- try:
- return SeqIO._FormatToWriter[outformat](outfile)
- finally:
- outfile.close()
+ return SeqIO._FormatToWriter[outformat](outfile)

def read_sequence(filename, informat=None):
@@ -97,6 +98,13 @@
return SeqIO.parse(autofile(filename,'rb'), informat)

+def guess_sequence_format(filename, informat=None):
+ informat = informat or guess_informat(filename)
+ informat = FORMAT_REMAP.get(informat,informat)
+ filename = parse_augmented_filename(filename, {})
+ return format,filename
+
+
def trim_quality(seq,q):
import numpy as np

==============================================================================
Revision: 69683a256021
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Fri Jul 20 09:52:06 2012
Log: Add support for IonTorrent variant output files
http://code.google.com/p/glu-genetics/source/detail?r=69683a256021

Modified:
/glu/modules/seq/annotate.py

=======================================
--- /glu/modules/seq/annotate.py Sun Apr 22 13:23:42 2012
+++ /glu/modules/seq/annotate.py Fri Jul 20 09:52:06 2012
@@ -15,8 +15,9 @@
import pysam

from glu.lib.utils import unique
-from glu.lib.fileutils import list_reader, autofile, hyphen
+from glu.lib.fileutils import list_reader, autofile, hyphen,
table_reader, table_writer
from glu.lib.progressbar import progress_loop
+from glu.lib.recordtype import recordtype

from glu.lib.genedb.queries import query_segdups, query_repeats

@@ -259,7 +260,7 @@

if kaviar:
kinfo = kaviar.get(v.chrom,v.start) or []
- ktext = [ stuff.replace(';',',') for
chrom,loc,mallele,maf,allele,stuff in kinfo if allele in v.var ]
+ ktext = [ stuff.replace(';',',').replace(' ','_') for
chrom,loc,mallele,maf,allele,stuff in kinfo if allele in v.var ]

if ktext:
v.filter.append('Kaviar')
@@ -365,7 +366,6 @@
metadata.setdefault('FILTER',[])
metadata.setdefault('INFO',[])

-
metadata['FILTER'].append('##FILTER=<ID=PartiallyCalled,Description="Variant
is not called for one or more samples">')
metadata['FILTER'].append('##FILTER=<ID=Indel,Description="Variant is an
insertion or deletion">')
metadata['FILTER'].append('##FILTER=<ID=Intergenic,Description="Variant
not in or near a gene">')
metadata['FILTER'].append('##FILTER=<ID=NPF,Description="Variant is not
predicted to alter a protein">')
@@ -422,6 +422,296 @@
out.write_locus(v)

+def update_ion_annotation(v, vs, cv, esp, kaviar, refvars, polyphen2,
options):
+ new_info = []
+
+ if vs:
+ # FIXME: Order genes and evidence consistently
+ evidence = list(vs.annotate(v.chrom, v.start, v.end, v.var,
nsonly=False))
+ #v.names = sorted(set(str(v) for e in evidence for v in
e.varid_exact)|set(v.names))
+ cytoband = sorted(set(e.cytoband for e in evidence if e.cytoband))
+ genes = sorted(set(e.gene.symbol for e in evidence if e.gene and
e.gene.symbol))
+ geneids = sorted(set(e.gene.geneid for e in evidence if e.gene and
e.gene.geneid))
+ location = sorted(set(e.intersect for e in evidence if
e.intersect))
+ function = sorted(set(e.func_type or e.func_class for e in evidence
if e.func))
+ nsevidence = [ e for e in evidence if e.func ]
+ nsinfo = ( '%s:%s:%s:%s' %
(e.gene.symbol,e.func_type,e.details,aa_change(e))
+ for e in nsevidence )
+ nsinfo = list(unique(nsinfo))
+
+ segdups = query_segdups(vs.con, v.chrom, v.start, v.end)
+ repeats = query_repeats(vs.con, v.chrom, v.start, v.end)
+
+ if not genes:
+ v.Intergenic = 'Y'
+
+ if not nsevidence:
+ v.NPF = 'Y'
+
+ if cytoband:
+ v.CYTOBAND = ','.join(cytoband)
+
+ if genes:
+ v.GENE_NAME = ','.join(genes)
+
+ if geneids:
+ v.GENE_ID = ','.join(str(g) for g in geneids)
+
+ if location:
+ v.GENE_LOCATION = ','.join(location)
+
+ if function:
+ v.GENE_FUNCTION = ','.join(function)
+
+ if nsinfo:
+ v.GENE_FUNCTION_DETAILS = ','.join(nsinfo)
+
+ if segdups:
+ segdup_info = ('chr%s:%s-%s:%.2f' %
(s.other_chrom,s.other_start,s.other_stop,s.matchFraction*100) for s in
segdups)
+
+ v.SegDup = 'Y'
+ v.SEGDUP_COUNT = len(segdups)
+ v.SEGDUP_INFO = ','.join(segdup_info)
+
+ if repeats:
+ repeat_info = ('%s:%s:%s' %
(r.repeatName,r.repeatClass,r.repeatFamily) for r in repeats)
+
+ v.Repeat = 'Y'
+ v.REPEAT_COUNT = len(repeats)
+ v.REPEAT_INFO = ','.join(repeat_info)
+
+ if cv:
+ cvinfo = cv.score_and_classify(v.chrom,v.start,v.end,[v.ref,v.var])
+
+ if cvinfo.exact_vars:
+ if 'tgp' in cvinfo.exact_vars:
+ v.TGP = 'Y'
+
+ vars = set(v.replace('dbsnp:','rs') for v in cvinfo.exact_vars)
+ vars.discard('tgp')
+
+ v.VAR_NAME = ','.join(sorted(vars))
+
+ if cvinfo.exact_vars or cvinfo.common_score>0:
+ v.COMMON_SCORE = '%.2f' % cvinfo.common_score
+
+ if cvinfo.function_info:
+ v.FUNCTION_INFO = ','.join(cvinfo.function_info)
+
+ if cvinfo.inexact_vars:
+ v.INEXACT_VARIANTS = ','.join(sorted(set(cvinfo.inexact_vars)))
+
+ if cvinfo.common_score>options.commonscore:
+ v.Common = 'Y'
+
+ if esp:
+ chrom = v.chrom
+ vars = set(v.var)
+ if chrom.startswith('chr'):
+ chrom = chrom[3:]
+
+ try:
+ esp_info = esp.fetch(chrom,v.start,v.end)
+ esp_info = [ e for e in esp_info if e.start==v.start and
e.end==v.end and vars.intersection(e.var) ]
+ except ValueError:
+ esp_info = None
+
+ if esp_info:
+ gtc = maf_ea = maf_aa = 0
+
+ for einfo in esp_info:
+ infomap = make_infomap(einfo.info)
+ if 'MAF' in infomap:
+ mafs = map(float,infomap['MAF'].split(','))
+ maf_ea = max(maf_ea,mafs[0]/100)
+ maf_aa = max(maf_aa,mafs[1]/100)
+
+ if 'GTC' in infomap:
+ gtcs = map(int,infomap['GTC'].split(','))
+ gtc = max(gtc,gtcs[0]+gtcs[1])
+
+ maf = max(maf_ea,maf_aa)
+
+ v.ESP = 'Y'
+ if maf>options.commonscore:
+ v.ESPCommon = 'Y'
+
+ v.ESP_COUNT = gtc
+ v.ESP_MAF_EA = '%.4f' % maf_ea
+ v.ESP_MAF_AA = '%.4f' % maf_aa
+
+ if kaviar:
+ kinfo = kaviar.get(v.chrom,v.start) or []
+ ktext = [ stuff.replace(';',',').replace(' ','_') for
chrom,loc,mallele,maf,allele,stuff in kinfo if allele in v.var ]
+
+ if ktext:
+ v.Kaviar = 'Y'
+
+ mallele = kinfo[0][2]
+ maf = kinfo[0][3]
+
+ if mallele and mallele in v.var:
+ v.KAVIAR_MAF = '%.2f' % maf
+
+ if maf>options.commonscore:
+ v.KaviarCommon = 'Y'
+
+ v.KAVIAR_NAMES = ','.join(ktext)
+
+
+ if refvars:
+ ingroup,outgroup = refvars.get(v.chrom,v.start,v.end,v.var) if refvars
else ([],[])
+
+ v.REFVAR_INGROUP_COUNT = len(ingroup)
+ v.REFVAR_OUTGROUP_COUNT = len(outgroup)
+
+ if ingroup:
+ v.REFVAR_INGROUP_NAMES = ','.join(ingroup)
+
+ if outgroup:
+ v.RefVar = 'Y'
+ v.REFVAR_OUTGROUP_NAMES = ','.join(outgroup)
+
+ if vs and polyphen2 and v.end-v.start==1 and 'CDS' in location:
+ pmap = {'A':0,'C':1,'G':2,'T':3}
+
+ try:
+ pvars = [ p.rstrip().split('\t') for p in
polyphen2.fetch(v.chrom,v.start,v.end) ]
+ except ValueError:
+ pvars = None
+
+ if pvars:
+ hdivs = []
+ hvars = []
+
+ for a in [v.var]:
+ if a in pmap:
+ i = pmap[a]
+
+ hdiv = hvar = ''
+ for p in pvars:
+ hdiv += p[3][i]
+ hvar += p[4][i]
+
+ hdiv = polyphen2_code(hdiv)
+ hvar = polyphen2_code(hvar)
+ else:
+ hdiv = '.'
+ hvar = '.'
+
+ #assert hdiv!='r' and hvar!='r'
+
+ hdivs.append(hdiv)
+ hvars.append(hvar)
+
+ if hdivs.count('.')!=len(hdivs):
+ v.POLYPHEN2_HDIV = ','.join(hdivs)
+ if hvars.count('.')!=len(hvars):
+ v.POLYPHEN2_HVAR = ','.join(hvars)
+
+
+ if not v.ref or not v.var:
+ v.Indel = 'Y'
+
+ return v
+
+
+def annotate_ion(options):
+ vars = table_reader(options.variants,sys.stdin)
+ header = next(vars)
+
+ fields = ['Chrom','Position','Ref','Variant']
+
+ try:
+ indices = [ header.index(f) for f in fields ]
+ except ValueError:
+ missing = ', '.join(f for f in fields if f not in header)
+ raise ValueError('Invalid IonTorrent Variant file. Missing
headers: %s' % missing)
+
+ var_fields = itemgetter(*indices)
+
+ vs = VariantAnnotator(options.genedb, options.reference)
+ cv = CGFVariants(options.cgfvariants, options.reference) if
options.cgfvariants else None
+ esp = VCFReader(options.esp) if options.esp else None
+ polyphen2= pysam.Tabixfile(options.polyphen2) if options.polyphen2 else
None
+
+ if options.kaviar:
+ references = list_reader(options.reference+'.fai')
+ kaviar = kaviar_reader(options.kaviar)
+ kaviar = OrderedReader(kaviar, references)
+ else:
+ kaviar = None
+
+ refvars = ReferenceVariants(options.refvars,options.refingroup) if
options.refvars else None
+
+ filters = ['Indel','Intergenic','NPF','SegDup','Repeat']
+ info =
['VAR_NAME','CYTOBAND','GENE_NAME','GENE_ID','GENE_LOCATION','GENE_FUNCTION',
+ 'GENE_FUNCTION_DETAILS','SEGDUP_COUNT','SEGDUP_INFO','REPEAT_COUNT','REPEAT_INFO']
+
+ if cv:
+ filters += ['Common','TGP']
+ info += ['COMMON_SCORE','FUNCTION_INFO','INEXACT_VARIANTS']
+
+
+ if esp:
+ filters += ['ESP', 'ESPCommon']
+ info += ['ESP_COUNT','ESP_MAF_EA','ESP_MAF_AA']
+
+ if kaviar:
+ filters += ['Kaviar','KaviarCommon']
+ info += ['KAVIAR_MAF','KAVIAR_NAMES']
+
+ if refvars:
+ filters += ['RefVar']
+ info +=
['REFVAR_INGROUP_COUNT','REFVAR_OUTGROUP_COUNT','REFVAR_INGROUP_NAMES','REFVAR_OUTGROUP_NAMES']
+
+ if polyphen2:
+ info += ['POLYPHEN2_HDIV','POLYPHEN2_HVAR']
+
+ new_fields = set(filters+info)
+ keep_fields = itemgetter( *(i for i,h in enumerate(header) if h and h
not in new_fields ) )
+
+ new_fields = filters+info
+ blanks = ['']*len(new_fields)
+ keep_header = list(keep_fields(header))
+ keep_len = len(keep_header)
+ new_header = keep_header+filters+info
+ Var = recordtype('Var',
['chrom','start','end','ref','var']+filters+info)
+ out = table_writer(options.output,hyphen=sys.stdout)
+
+ out.writerow(new_header)
+
+
+ for rec in vars:
+ chrom,pos,ref,var = var_fields(rec)
+
+ pos = int(pos)
+ start = pos-1
+ end = pos
+
+ # Fix indels encoded in the absurd VCF-like manner
+ if ref and var and ref[0]==var[0]:
+ ref = ref[1:]
+ var = var[1:]
+ start += 1
+ end += 1
+
+ rec[indices[1]] = pos+1
+ rec[indices[2]] = ref
+ rec[indices[3]] = var
+
+ v = Var( *([chrom,start,end,ref,var]+blanks) )
+ update_ion_annotation(v, vs, cv, esp, kaviar, refvars, polyphen2,
options)
+
+ new_rec = list(keep_fields(rec))
+ if len(new_rec)<keep_len:
+ new_rec += ['']*(keep_len-len(new_rec))
+
+ new_rec += list(v[5:])
+
+ out.writerow(new_rec)
+
+
def valid_allele(a):
return a is not None and a!='=' and 'N' not in a and '?' not in a

@@ -517,7 +807,7 @@
parser.add_argument('variants', help='Input variant file')

parser.add_argument('-f', '--format', metavar='NAME', default='VCF',
- help='File format (VCF)')
+ help='File format (VCF, ION)')
parser.add_argument('-g', '--genedb', metavar='NAME',
help='Genedb genome annotation database name or
file')
parser.add_argument('-r', '--reference', metavar='NAME', required=True,
@@ -546,8 +836,10 @@
options = parser.parse_args()
format = options.format.upper()

- if format=='VCF': # *** VCF, SNPs only ***
+ if format=='VCF': # *** VCF ***
annotate_vcf(options)
+ if format=='ION':
+ annotate_ion(options)
elif format=='MASTERVAR':
annotate_mastervar(options)
else:

==============================================================================
Revision: 7a5a01e793b2
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Fri Jul 20 09:52:28 2012
Log: Remove unused import
http://code.google.com/p/glu-genetics/source/detail?r=7a5a01e793b2

Modified:
/glu/modules/seq/amplicon_coverage.py

=======================================
--- /glu/modules/seq/amplicon_coverage.py Sat Nov 19 06:22:12 2011
+++ /glu/modules/seq/amplicon_coverage.py Fri Jul 20 09:52:28 2012
@@ -12,7 +12,7 @@
import sys

from collections import deque
-from operator import attrgetter, itemgetter
+from operator import attrgetter
from itertools import groupby, imap, izip

import pysam

==============================================================================
Revision: 53a419da4d43
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Fri Jul 20 09:54:30 2012
Log: Cleanups and fixes to allow better filtering of samples and use of
non-renormalized data.
http://code.google.com/p/glu-genetics/source/detail?r=53a419da4d43

Modified:
/glu/modules/cnv/gdat.py
/glu/modules/cnv/index_gdats.py
/glu/modules/cnv/make_gcmodel.py
/glu/modules/cnv/plot.py
/glu/modules/cnv/plot_events.py
/glu/modules/cnv/plot_gdat.py
/glu/modules/cnv/stats.py

=======================================
--- /glu/modules/cnv/gdat.py Wed Jan 11 05:05:55 2012
+++ /glu/modules/cnv/gdat.py Fri Jul 20 09:54:30 2012
@@ -1,4 +1,6 @@
+import os
import sys
+
import h5py
import sqlite3
import numpy as np
@@ -88,7 +90,7 @@

if self.format_found!='gdat':
raise ValueError('Input file "%s" does not appear to be in %s
format. Found %s.' \
- % (namefile(filename),format,self.format_found))
+ % (namefile(filename),'gdat',self.format_found))

if self.gdat_version!=1:
raise ValueError('Unknown gdat file version: %s' % self.gdat_version)
@@ -214,6 +216,9 @@

yield sample,snps,genos[i],sample_lrr,sample_baf

+ def __contains__(self,item):
+ return item in self.gdat
+
def __getitem__(self,item):
return self.gdat[item]

@@ -225,7 +230,7 @@

def gdat_decoder(table,rows):
- if 'SCALE' not in table.attrs:
+ if not hasattr(table,'attrs') or 'SCALE' not in table.attrs:
return rows

def _decoder(table,rows):
@@ -238,6 +243,11 @@

def block_iter(table):
+ if not hasattr(table,'chunks'):
+ for row in table:
+ yield row
+ return
+
chunksize = table.chunks[0]
start = 0
last = len(table)
@@ -333,7 +343,7 @@
con.execute('PRAGMA journal_mode=OFF;')
con.execute('PRAGMA count_changes=OFF;')
con.execute('PRAGMA cache_size=200000;')
- con.execute('PRAGMA default_cache_size=200000;')
+ #con.execute('PRAGMA default_cache_size=200000;')

class GDATIndex(object):
@@ -410,7 +420,7 @@
def index(self,gdat):
self.clear_index(gdat)

- filename = gdat.filename
+ filename = os.path.abspath(gdat.filename)
manifest = gdat.attrs['ManifestName']
rows = [ (name,filename,i,manifest) for i,name in
enumerate(gdat.samples) ]

@@ -425,11 +435,16 @@

try:
gc = gcdata['GC'][:].T
+ gcmask = np.isfinite(gc.sum(axis=1))
+
+ #import scipy
+ #r2 = scipy.corrcoef(gc[gcmask],rowvar=0)
+ #from glu.lib.fileutils import table_writer
+ #table_writer('gc_cov.txt').writerows(r2)

#small = np.array([5,6,7,8,14,15,16,17],dtype=int)
#gc = gc[:,small]

- gcmask = np.isfinite(gc.sum(axis=1))
gcmeans = gc[gcmask].mean(axis=0)
gc -= gcmeans.reshape(1,-1)
n = len(gc)
=======================================
--- /glu/modules/cnv/index_gdats.py Sat Aug 20 11:23:49 2011
+++ /glu/modules/cnv/index_gdats.py Fri Jul 20 09:54:30 2012
@@ -34,6 +34,9 @@
except IOError:
print 'Unable to open GDAT file: %s' % filename
continue
+ except ValueError:
+ print 'Invalid GDAT file: %s' % filename
+ continue

manifest = gdat.attrs['ManifestName']

=======================================
--- /glu/modules/cnv/make_gcmodel.py Wed Jan 11 05:05:55 2012
+++ /glu/modules/cnv/make_gcmodel.py Fri Jul 20 09:54:30 2012
@@ -126,6 +126,10 @@
attrs['TermCount'] = n

for chrom,chrom_snps in groupby(allsnps, itemgetter(1)):
+ if not chrom:
+ sys.stderr.write('Skipping region: %s\n' % chrom)
+ continue
+
try:
seq = reference.fetch('chr'+chrom).upper()
except IndexError:
=======================================
--- /glu/modules/cnv/plot.py Sun Nov 6 13:45:25 2011
+++ /glu/modules/cnv/plot.py Fri Jul 20 09:54:30 2012
@@ -114,9 +114,12 @@
miss = genos==' '
homs = (~miss)&(~hets)

- sec.scatter(pos[homs], baf[homs], c='darkred', marker='o', s=2,
linewidths=0, alpha=0.8, zorder=4)
- sec.scatter(pos[hets], baf[hets], c='red', marker='o', s=2,
linewidths=0, alpha=0.5, zorder=5)
- sec.scatter(pos[miss], baf[miss], c='red', marker='o', s=2,
linewidths=0, alpha=0.8, zorder=6)
+ if homs.sum():
+ sec.scatter(pos[homs], baf[homs], c='darkred', marker='o', s=2,
linewidths=0, alpha=0.8, zorder=4)
+ if hets.sum():
+ sec.scatter(pos[hets], baf[hets], c='red', marker='o', s=2,
linewidths=0, alpha=0.5, zorder=5)
+ if miss.sum():
+ sec.scatter(pos[miss], baf[miss], c='red', marker='o', s=2,
linewidths=0, alpha=0.8, zorder=6)

main.set_xlabel('Chromosome Location (Mbps)')
main.set_ylabel('Log Intensity Ratio (LRR)')
=======================================
--- /glu/modules/cnv/plot_events.py Sun Nov 6 13:45:25 2011
+++ /glu/modules/cnv/plot_events.py Fri Jul 20 09:54:30 2012
@@ -27,6 +27,9 @@
from glu.modules.cnv.plot import plot_chromosome
from glu.modules.cnv.normalize import quantile

+from glu.lib.genolib.transform import GenoTransform
+
+
Event = recordtype('Event', 'chrom start stop fields lrr baf baf_model
state pmosaic')

@@ -135,7 +138,10 @@

mix = mixture.MixtureModel(len(dists), weights, dists)

- post,logL = mix.EM(dataset, 100, delta=0.001, silent=True)
+ try:
+ post,logL = mix.EM(dataset, 100, delta=0.001, silent=True)
+ except mixture.ConvergenceFailureEM:
+ return None

w = np.array(mix.pi, dtype=float)
mu = np.array([het.mu for het in hets], dtype=float)
@@ -171,6 +177,10 @@

t0 = time.time()
fits = [ fit_gmm4(c,baf) for c in range(0,max_baf+1) ]
+
+ if None in fits:
+ return None
+
AIC = np.array([ (20*c-2*f[0]) for c,f in enumerate(fits) ],
dtype=float)
best = AIC.argmin()

@@ -462,6 +472,11 @@
help='Segment start index column
name (default=SEG_START)')
parser.add_argument('--segstop', metavar='COLUMN', default='SEG_STOP',
help='Segment stop index column name
(default=SEG_STOP)')
+ parser.add_argument('--includesamples', metavar='FILE', action='append',
+ help='List of samples to include,
all others will be skipped')
+ parser.add_argument('--excludesamples', metavar='FILE', action='append',
+ help='List of samples to exclude,
only samples not present will be kept')
+
table_options(parser)

return parser
@@ -497,7 +512,21 @@
if out:
out.writerow(header)

+ transform = GenoTransform.from_object(options)
+
+ if transform is not None:
+ include = transform.samples.include
+ exclude = transform.samples.exclude
+ else:
+ include = exclude = None
+
for assay,assay_events in
groupby(events,key=attrgetter(options.assayid)):
+ if include is not None and assay not in include:
+ continue
+
+ if exclude is not None and assay in exclude:
+ continue
+
assay_events = list(assay_events)

locations = list(indexfile.get(assay))
=======================================
--- /glu/modules/cnv/plot_gdat.py Wed Jan 11 05:05:55 2012
+++ /glu/modules/cnv/plot_gdat.py Fri Jul 20 09:54:30 2012
@@ -14,15 +14,17 @@
import scipy
import scipy.stats

-from itertools import groupby
-from operator import itemgetter,attrgetter
-
-from collections import namedtuple
-
-from glu.lib.fileutils import table_reader, table_writer
-
-from glu.modules.cnv.gdat import GDATFile, get_gcmodel, gc_correct
-from glu.modules.cnv.plot import plot_chromosome
+from itertools import groupby
+from operator import itemgetter,attrgetter
+
+from collections import namedtuple
+
+from glu.lib.fileutils import table_reader, table_writer
+
+from glu.modules.cnv.gdat import GDATFile, get_gcmodel, gc_correct
+from glu.modules.cnv.plot import plot_chromosome
+
+from glu.lib.genolib.transform import GenoTransform

def option_parser():
@@ -43,6 +45,10 @@
help='Plot name template
(default={GDAT}_{ASSAY}_chr{CHROMOSOME}.png)')
parser.add_argument('--title', metavar='VAL', default='Intensity
plot of {GDAT}:{ASSAY} chr{CHROMOSOME}',
help="Plot name template
(default='Intensity plot of {GDAT}:{ASSAY} chr{CHROMOSOME}')")
+ parser.add_argument('--includesamples', metavar='FILE', action='append',
+ help='List of samples to include,
all others will be skipped')
+ parser.add_argument('--excludesamples', metavar='FILE', action='append',
+ help='List of samples to exclude,
only samples not present will be kept')

return parser

@@ -70,7 +76,21 @@
if not chromosomes:
chromosomes = sorted(c for c in chrom_index if c)

+ transform = GenoTransform.from_object(options)
+
+ if transform is not None:
+ include = transform.samples.include
+ exclude = transform.samples.exclude
+ else:
+ include = exclude = None
+
for offset,assay in enumerate(gdat.samples):
+ if include is not None and assay not in include:
+ continue
+
+ if exclude is not None and assay in exclude:
+ continue
+
print 'GDAT:',gdatname,assay
assay_id,genos,lrr,baf = gdat.cnv_data(offset,
raw=options.signal.lower()=='raw')
mask = np.isfinite(lrr)&(lrr>=-2)&(lrr<=2)
=======================================
--- /glu/modules/cnv/stats.py Tue Mar 27 08:17:35 2012
+++ /glu/modules/cnv/stats.py Fri Jul 20 09:54:30 2012
@@ -13,6 +13,7 @@
import sys

from math import modf
+from itertools import repeat

import numpy as np
import scipy.stats as stats
@@ -56,10 +57,19 @@
samples = gdat.samples
BAF_orig = gdat['BAF']
LRR_orig = gdat['LRR']
- BAF_norm = gdat['BAF_QN']
- LRR_norm = gdat['LRR_QN']
genotypes = gdat['Genotype']

+ if 'LRR_QN' in gdat:
+ BAF_norm = gdat['BAF_QN']
+ LRR_norm = gdat['LRR_QN']
+ else:
+ BAF_empty = np.empty( (BAF_orig.shape[1],), dtype=float )
+ LRR_empty = np.empty( (LRR_orig.shape[1],), dtype=float )
+ BAF_empty[:] = np.nan
+ LRR_empty[:] = np.nan
+ BAF_norm = repeat(BAF_empty)
+ LRR_norm = repeat(LRR_empty)
+
print 'samples=%d, SNPs=%d' % (n,s)

np.set_printoptions(linewidth=120,precision=2,suppress=True)

==============================================================================
Revision: 87c5a1f4e1f0
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Fri Jul 20 09:55:49 2012
Log: Improve VCF writer to accommodate parsers that are strict about
disallowing
whitespace in INFO records.
http://code.google.com/p/glu-genetics/source/detail?r=87c5a1f4e1f0

Modified:
/glu/lib/seqlib/vcf.py

=======================================
--- /glu/lib/seqlib/vcf.py Wed Apr 18 09:25:18 2012
+++ /glu/lib/seqlib/vcf.py Fri Jul 20 09:55:49 2012
@@ -24,7 +24,7 @@
def _intern(x,strcache=strcache.setdefault): return strcache(x,x)

-def _encode_vcf_records(rows):
+def _encode_vcf_records(rows,normalize_indels=True):
for row in rows:
chrom = _intern(row[0])
start = int(row[1])-1
@@ -51,27 +51,29 @@
format = genos = None

# VCF codes indels with an extra left reference base, which we strip
- r = ref[0]
- if all(a.startswith(r) for a in var):
- start += 1
- ref = _intern(ref[1:])
- var = [ _intern(v[1:]) for v in var ]
+ if normalize_indels:
+ r = ref[0]
+ if all(a.startswith(r) for a in var):
+ start += 1
+ ref = _intern(ref[1:])
+ var = [ _intern(v[1:]) for v in var ]

yield
VCFRecord(chrom,start,end,names,ref,var,qual,filter,info,format,genos)

class VCFReader(object):
- def __init__(self, filename, hyphen=None, field_size_limit=1024*1024):
+ def __init__(self, filename, hyphen=None, normalize_indels=True,
field_size_limit=1024*1024):
if csv.field_size_limit()<field_size_limit:
csv.field_size_limit(field_size_limit)

- self.filename = filename
- self.tabixfile = None
- self.data = data = table_reader(filename,hyphen=sys.stdin)
-
- self.metadata = metadata = OrderedDict()
- self.header = None
- self.samples = None
+ self.filename = filename
+ self.normalize_indels = normalize_indels
+ self.tabixfile = None
+ self.data = data = table_reader(filename,hyphen=sys.stdin)
+
+ self.metadata = metadata = OrderedDict()
+ self.header = None
+ self.samples = None

for row in data:
if not row:
@@ -99,7 +101,7 @@

def __iter__(self):
- return _encode_vcf_records(self.data)
+ return _encode_vcf_records(self.data,self.normalize_indels)

def fetch(self, chromosome, start, stop):
tabixfile = self.tabixfile
@@ -109,12 +111,12 @@

records = [ r.split('\t') for r in tabixfile.fetch(chromosome, start,
stop) ]

- return _encode_vcf_records(records)
+ return _encode_vcf_records(records,self.normalize_indels)

class VCFWriter(object):
- def __init__(self, filename, metadata, names, reference):
- self.reference = pysam.Fastafile(reference)
+ def __init__(self, filename, metadata, names, reference=None):
+ self.reference = pysam.Fastafile(reference) if reference is not None
else None
self.out = out = autofile(hyphen(filename,sys.stdout),'w')

for meta in metadata:
@@ -133,6 +135,9 @@

# VCF codes indels with an extra left reference base
if not vcfvar.ref or '' in vcfvar.var:
+ if self.reference is None:
+ raise RuntimeError('Reference sequence required to write VCF indel
loci')
+
pos -= 1
r = self.reference.fetch(vcfvar.chrom,pos-1,pos).upper()
ref = r+ref
@@ -148,6 +153,12 @@
';'.join(vcfvar.info),
vcfvar.format ] + [ ':'.join(g) for g in vcfvar.genos ]

- out = self.out
- out.write('\t'.join(map(str,row)))
+ out = self.out
+
+ text = '\t'.join(map(str,row))
+
+ if ' ' in text:
+ text = text.replace(' ','_')
+
+ out.write(text)
out.write('\n')

Reply all

Reply to author

Forward

0 new messages