3 new revisions:
Revision: d23122cec655
Author: Kevin B. Jacobs <
jac...@bioinformed.com>
Date: Sun Aug 5 11:23:56 2012
Log: Add missing import of re
http://code.google.com/p/glu-genetics/source/detail?r=d23122cec655
Revision: 4d60bf0d3bcb
Author: Kevin B. Jacobs <
jac...@bioinformed.com>
Date: Sun Aug 5 11:24:41 2012
Log: Make VCF genotyping parsing lazy
http://code.google.com/p/glu-genetics/source/detail?r=4d60bf0d3bcb
Revision: cc14d078fe18
Author: Kevin B. Jacobs <
jac...@bioinformed.com>
Date: Sun Aug 5 11:27:40 2012
Log: Move lazy_property into lib.utils
http://code.google.com/p/glu-genetics/source/detail?r=cc14d078fe18
==============================================================================
Revision: d23122cec655
Author: Kevin B. Jacobs <
jac...@bioinformed.com>
Date: Sun Aug 5 11:23:56 2012
Log: Add missing import of re
http://code.google.com/p/glu-genetics/source/detail?r=d23122cec655
Modified:
/glu/modules/seq/filtervcf.py
=======================================
--- /glu/modules/seq/filtervcf.py Sun Aug 5 04:34:07 2012
+++ /glu/modules/seq/filtervcf.py Sun Aug 5 11:23:56 2012
@@ -9,7 +9,7 @@
__revision__ = '$Id$'
-import time
+import re
import sys
from glu.lib.fileutils import table_reader
==============================================================================
Revision: 4d60bf0d3bcb
Author: Kevin B. Jacobs <
jac...@bioinformed.com>
Date: Sun Aug 5 11:24:41 2012
Log: Make VCF genotyping parsing lazy
http://code.google.com/p/glu-genetics/source/detail?r=4d60bf0d3bcb
Modified:
/glu/lib/seqlib/vcf.py
=======================================
--- /glu/lib/seqlib/vcf.py Fri Jul 20 09:55:49 2012
+++ /glu/lib/seqlib/vcf.py Sun Aug 5 11:24:41 2012
@@ -6,49 +6,67 @@
__copyright__ = 'Copyright (c) 2010, BioInformed LLC and the U.S.
Department of Health & Human Services. Funded by NCI under Contract
N01-CO-12400.'
__license__ = 'See GLU license for terms by running: glu license'
-import csv
import sys
-from collections import defaultdict, OrderedDict
+from collections import OrderedDict
import pysam
-from glu.lib.fileutils import table_reader, autofile, hyphen
+from glu.lib import fileutils
from glu.lib.recordtype import recordtype
-VCFRecord = recordtype('VCFRecord', 'chrom start end names ref var qual
filter info format genos')
+VCFRecordBase = recordtype('VCFRecord', 'chrom start end names ref var
qual filter info format genostr genolist')
-strcache = {}
-def _intern(x,strcache=strcache.setdefault): return strcache(x,x)
-
-
-def _encode_vcf_records(rows,normalize_indels=True):
- for row in rows:
- chrom = _intern(row[0])
- start = int(row[1])-1
- names = row[2].split(';') if row[2]!='.' else []
- ref = _intern(row[3])
- end = start+len(ref)
- var = [ _intern(v) for v in row[4].split(',') ]
- qual = row[5]
- filter = [ _intern(f) for f in row[6].split(';') ] if row[6]!='.'
else []
- info = row[7].split(';') if row[7]!='.' else []
-
- n = len(row)
- if n>8:
- format = _intern(row[8])
-
- if n>9:
- if ':' in format:
- genos = [ g.split(':') for g in row[9:] ] if len(row)>9
else None
- else:
- genos = [ [g] for g in row[9:] ] if len(row)>9
else None
- else:
- genos = None
+class VCFRecord(VCFRecordBase):
+ __slots__ = ()
+
+ @property
+ def genos(self):
+ genos = self.genolist
+
+ if genos is not None:
+ return genos
+
+ format = self.format
+ genostr = self.genostr
+
+ fields = genostr.split('\t') if genostr else []
+
+ if not fields:
+ genos = fields
+ elif ':' in format:
+ genos = [ g.split(':') for g in fields ]
else:
- format = genos = None
+ genos = [ [g] for g in fields ]
+
+ self.genolist = genos
+
+ return genos
+
+
+strcache = {}
+def _intern(x,strcache=strcache.setdefault): return strcache(x,x)
+
+
+def _encode_vcf_records(data,normalize_indels=True):
+ for line in data:
+ line = line.rstrip()
+ fields = line.split('\t',9)
+ n = len(fields)
+
+ chrom = _intern(fields[0])
+ start = int(fields[1])-1
+ names = fields[2].split(';') if fields[2]!='.' else []
+ ref = _intern(fields[3])
+ end = start+len(ref)
+ var = [ _intern(v) for v in fields[4].split(',') ]
+ qual = fields[5]
+ filter = [ _intern(f) for f in fields[6].split(';') ] if
fields[6]!='.' else []
+ info = fields[7].split(';') if fields[7]!='.' else []
+ format = _intern(fields[8]) if n>8 else None
+ genostr = fields[9] if n>9 else None
# VCF codes indels with an extra left reference base, which we strip
if normalize_indels:
@@ -58,42 +76,35 @@
ref = _intern(ref[1:])
var = [ _intern(v[1:]) for v in var ]
- yield
VCFRecord(chrom,start,end,names,ref,var,qual,filter,info,format,genos)
+ yield
VCFRecord(chrom,start,end,names,ref,var,qual,filter,info,format,genostr,None)
class VCFReader(object):
- def __init__(self, filename, hyphen=None, normalize_indels=True,
field_size_limit=1024*1024):
- if csv.field_size_limit()<field_size_limit:
- csv.field_size_limit(field_size_limit)
-
+ def __init__(self, filename, hyphen=sys.stdin, normalize_indels=True,
field_size_limit=1024*1024):
self.filename = filename
self.normalize_indels = normalize_indels
self.tabixfile = None
- self.data = data = table_reader(filename,hyphen=sys.stdin)
+ self.data = data =
fileutils.autofile(fileutils.hyphen(filename,hyphen))
self.metadata = metadata = OrderedDict()
self.header = None
self.samples = None
- for row in data:
- if not row:
- continue
- elif row[0].startswith('##'):
- if len(row)!=1:
- raise ValueError('Invalid VCF header line')
-
- meta,value = row[0].split('=',1)
- meta = meta[2:]
+ for line in data:
+ line = line.rstrip()
+
+ if line.startswith('##'):
+ meta,value = line.split('=',1)
+ meta = meta[2:]
if meta not in metadata:
metadata[meta] = []
- metadata[meta].append(row[0])
-
- elif row[0].startswith('#'):
- self.header = header = list(row)
- header[0] = header[0][1:]
-
+ metadata[meta].append(line)
+
+ elif line.startswith('#'):
+ self.header = header = line.split('\t')
+ header[0] = header[0][1:]
self.samples = [ s.split('.')[0] for s in header[9:] ]
break
else:
@@ -109,15 +120,15 @@
if tabixfile is None:
self.tabixfile = tabixfile =
pysam.Tabixfile(self.filename,cache_size=128*1024*1024)
- records = [ r.split('\t') for r in tabixfile.fetch(chromosome, start,
stop) ]
+ records = tabixfile.fetch(chromosome, start, stop)
return _encode_vcf_records(records,self.normalize_indels)
class VCFWriter(object):
- def __init__(self, filename, metadata, names, reference=None):
+ def __init__(self, filename, metadata, names,
reference=None,hyphen=sys.stdout):
self.reference = pysam.Fastafile(reference) if reference is not None
else None
- self.out = out = autofile(hyphen(filename,sys.stdout),'w')
+ self.out = out =
fileutils.autofile(fileutils.hyphen(filename,hyphen),'w')
for meta in metadata:
for m in metadata[meta]:
==============================================================================
Revision: cc14d078fe18
Author: Kevin B. Jacobs <
jac...@bioinformed.com>
Date: Sun Aug 5 11:27:40 2012
Log: Move lazy_property into lib.utils
http://code.google.com/p/glu-genetics/source/detail?r=cc14d078fe18
Modified:
/glu/lib/utils.py
/glu/modules/cnv/gdat.py
=======================================
--- /glu/lib/utils.py Sun Apr 8 12:45:05 2012
+++ /glu/lib/utils.py Sun Aug 5 11:27:40 2012
@@ -953,6 +953,23 @@
return genfrom_queue(q)
+def lazy_property(fn):
+ '''
+ Return a lazily computed property.
+
+ This property will compute a read-only result once, but only when
+ requested. All subsequent requests will used the stored result of the
+ first computation.
+ '''
+ attr_name = '_lazy_' + fn.__name__
+ @property
+ def _lazy_property(self):
+ if not hasattr(self, attr_name):
+ setattr(self, attr_name, fn(self))
+ return getattr(self, attr_name)
+ return _lazy_property
+
+
def _test():
import doctest
doctest.testmod()
=======================================
--- /glu/modules/cnv/gdat.py Wed Jul 25 05:07:46 2012
+++ /glu/modules/cnv/gdat.py Sun Aug 5 11:27:40 2012
@@ -8,7 +8,7 @@
from itertools import izip
-from glu.lib.utils import is_str,namedtuple
+from glu.lib.utils import is_str,namedtuple,lazy_property
from glu.lib.fileutils import parse_augmented_filename,
compressed_filename, namefile
from glu.lib.glm import Linear
@@ -32,16 +32,6 @@
LRR_MAX = np.iinfo(LRR_TYPE).max
-def lazy_property(fn):
- attr_name = '_lazy_' + fn.__name__
- @property
- def _lazy_property(self):
- if not hasattr(self, attr_name):
- setattr(self, attr_name, fn(self))
- return getattr(self, attr_name)
- return _lazy_property
-
-
def gdat_encode_f(data, scale, minval, maxval, nanval):
data = data*scale
np.clip(data, minval, maxval, out=data)