[glu-genetics] push by bioinformed@gmail.com - Minor updates and tweaks on 2012-03-14 13:30 GMT

4 views

Skip to first unread message

glu-ge...@googlecode.com

unread,

Mar 14, 2012, 9:30:35 AM3/14/12

to glu...@googlegroups.com

Revision: b84fadb5076e
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Wed Mar 14 06:29:49 2012
Log: Minor updates and tweaks
http://code.google.com/p/glu-genetics/source/detail?r=b84fadb5076e

Modified:
/glu/lib/fileutils/auto.py
/glu/lib/seqlib/cga.py
/glu/lib/seqlib/vcf.py
/glu/modules/seq/annotate.py

=======================================
--- /glu/lib/fileutils/auto.py Sat Jan 8 10:06:11 2011
+++ /glu/lib/fileutils/auto.py Wed Mar 14 06:29:49 2012
@@ -137,7 +137,7 @@
return COMPRESSED_SUFFIXES.get(ext,'')

-def autofile(filename, mode='r'):
+def autofile(filename, mode='r', bufsize=-1):
'''
Return a file object in the correct compressed format as specified, which
is ready to read from or write to
@@ -161,16 +161,16 @@
f = file(filename, mode)
elif comp == 'gzip':
try:
- f = spawn_compressor(os.environ.get('GLU_GZIP','gzip'), filename,
mode)
+ f = spawn_compressor(os.environ.get('GLU_GZIP','gzip'), filename,
mode, bufsize=bufsize)
except _autofile_errors:
import gzip
f = gzip.GzipFile(filename, mode)
elif comp == 'bzip2':
try:
- f = spawn_compressor(os.environ.get('GLU_BZIP2','bzip2'), filename,
mode)
+ f = spawn_compressor(os.environ.get('GLU_BZIP2','bzip2'), filename,
mode, bufsize=bufsize)
except _autofile_errors:
import bz2
- f = bz2.BZ2File(filename, mode)
+ f = bz2.BZ2File(filename, mode, buffering=max(0,bufsize))

return f

=======================================
--- /glu/lib/seqlib/cga.py Wed Jan 11 05:01:28 2012
+++ /glu/lib/seqlib/cga.py Wed Mar 14 06:29:49 2012
@@ -38,7 +38,9 @@
teinAcc symbol orientation component componentIndex
hasCodingRegion impact nucleotidePos proteinPosannotationRefSequence
sampleSequence genomeRefSequence pfam
'''

- cgafile =
csv.reader(hyphen(autofile(filename),hyfile),dialect='excel-tab')
+ bufsize = kwargs.get('bufsize',-1)
+
+ cgafile =
csv.reader(hyphen(autofile(filename,bufsize=bufsize),hyfile),dialect='excel-tab')

extra = kwargs.get('extra',None)
attrs = {}
@@ -68,9 +70,9 @@

if extra:
e = [None]*len(extra)
- records = (Row._make(row+e) for row in cgafile)
+ records = (Row(*(row+e)) for row in cgafile)
else:
- records = (Row._make(row) for row in cgafile)
+ records = (Row(*row) for row in cgafile)

return attrs,header,records

@@ -86,7 +88,11 @@

for rec in records:
rec.chromosome =
cache_str(rec.chromosome,rec.chromosome)
- rec.position = int(rec.position)
+ if hasattr(rec,'position'):
+ rec.position = int(rec.position)
+ else:
+ rec.begin = int(rec.begin)
+ rec.end = int(rec.end)
rec.uniqueSequenceCoverage = float(rec.uniqueSequenceCoverage)
rec.weightSumSequenceCoverage = float(rec.weightSumSequenceCoverage)
rec.gcCorrectedCvg = float(rec.gcCorrectedCvg)
@@ -237,11 +243,11 @@
records = []

for g in genetext.split(';'):
- gene = generecord._make(g.split(':'))
+ gene = generecord(*g.split(':'))

gene.geneId = int(gene.geneId)
gene.mrnaAcc = gene.mrnaAcc or None
- gene.symbol = gene.symbol or None
+ gene.symbol = gene.symbol or None
gene.component = cache_str(gene.component,gene.component)
gene.impact = cache_str(gene.impact,gene.impact)

@@ -315,6 +321,10 @@
str_cache = {'':None}
cache_str = str_cache.setdefault

+ skip_ref = kwargs.get('skip_ref',False)
+ skip_lq = kwargs.get('skip_lq',False)
+ cache_strings = kwargs.get('cache_strings',False)
+
def parseGene(genetext):
if not genetext:
return None
@@ -322,21 +332,21 @@
records = []

for g in genetext.split(';'):
- gene = generecord._make(g.split(':'))
+ gene = generecord(*g.split(':'))

gene.geneId = int(gene.geneId)
gene.mrnaAcc = gene.mrnaAcc or None
- gene.symbol = gene.symbol or None
- gene.component = cache_str(gene.component,gene.component)
- gene.impact = cache_str(gene.impact,gene.impact)
+ gene.symbol = gene.symbol or None
+
+ if cache_strings:
+ gene.component = cache_str(gene.component,gene.component)
+ gene.impact = cache_str(gene.impact,gene.impact)

records.append(gene)

return records

- missing = ('','N')
- skip_ref = kwargs.get('skip_ref',False)
- skip_lq = kwargs.get('skip_lq',False)
+ missing = ('','N')

for record in records:
if skip_ref and (record.zygosity=='no-call' or record.varType=='ref'):
@@ -353,38 +363,41 @@

record.locus = int(record.locus)
record.ploidy = int(record.ploidy)
- record.chromosome =
cache_str(record.chromosome,record.chromosome)
record.begin = int(record.begin)
record.end = int(record.end)
- record.zygosity =
cache_str(record.zygosity,record.zygosity)
- record.varType =
cache_str(record.varType,record.varType)
- record.reference =
cache_str(record.reference,record.reference)
- record.allele1Seq =
cache_str(record.allele1Seq,record.allele1Seq)
- record.allele2Seq =
cache_str(record.allele2Seq,record.allele2Seq)
record.allele1VarScoreVAF = int(record.allele1VarScoreVAF) if
record.allele1VarScoreVAF else None
record.allele2VarScoreVAF = int(record.allele2VarScoreVAF) if
record.allele2VarScoreVAF else None
record.allele1VarScoreEAF = int(record.allele1VarScoreEAF) if
record.allele1VarScoreEAF else None
record.allele2VarScoreEAF = int(record.allele2VarScoreEAF) if
record.allele2VarScoreEAF else None
- record.allele1VarQuality = cache_str(record.allele1VarQuality)
- record.allele2VarQuality = cache_str(record.allele2VarQuality)
record.allele1HapLink = record.allele1HapLink or None
record.allele2HapLink = record.allele2HapLink or None
- record.allele1XRef = record.allele1XRef or None
- record.allele2XRef = record.allele2XRef or None
+ record.allele1XRef = record.allele1XRef or None
+ record.allele2XRef = record.allele2XRef or None
record.evidenceIntervalId = int(record.evidenceIntervalId) if
record.evidenceIntervalId else None
record.allele1ReadCount = int(record.allele1ReadCount) if
record.allele1ReadCount else None
record.allele2ReadCount = int(record.allele2ReadCount) if
record.allele2ReadCount else None
record.referenceAlleleReadCount = int(record.referenceAlleleReadCount)
if record.referenceAlleleReadCount else None
record.totalReadCount = int(record.totalReadCount) if
record.totalReadCount else None
- record.pfam = record.pfam or None
- record.miRBaseId = record.miRBaseId or None
- record.repeatMasker = record.repeatMasker or None
- record.segDupOverlap = record.segDupOverlap or None
+ record.pfam = record.pfam or None
+ record.miRBaseId = record.miRBaseId or None
+ record.repeatMasker = record.repeatMasker or None
+ record.segDupOverlap = record.segDupOverlap or None
record.relativeCoverageDiploid =
float(record.relativeCoverageDiploid) if record.relativeCoverageDiploid not
in missing else None
record.calledPloidy = int(record.calledPloidy) if
record.calledPloidy not in missing else None

+ if cache_strings:
+ record.chromosome =
cache_str(record.chromosome,record.chromosome)
+ record.zygosity =
cache_str(record.zygosity,record.zygosity)
+ record.varType =
cache_str(record.varType,record.varType)
+ record.reference =
cache_str(record.reference,record.reference)
+ record.allele1Seq =
cache_str(record.allele1Seq,record.allele1Seq)
+ record.allele2Seq =
cache_str(record.allele2Seq,record.allele2Seq)
+ record.allele1VarQuality =
cache_str(record.allele1VarQuality,record.allele1VarQuality)
+ record.allele2VarQuality =
cache_str(record.allele2VarQuality,record.allele2VarQuality)
+
if hasattr(record,'relativeCoverageNondiploid'):
record.relativeCoverageNondiploid =
float(record.relativeCoverageNondiploid) if
record.relativeCoverageNondiploid not in missing else None
+
if hasattr(record,'calledLevel'):
record.calledLevel = float(record.calledLevel) if
record.calledLevel not in missing else None

=======================================
--- /glu/lib/seqlib/vcf.py Wed Jan 25 17:55:31 2012
+++ /glu/lib/seqlib/vcf.py Wed Mar 14 06:29:49 2012
@@ -70,7 +70,11 @@
filter = [ intern(f) for f in row[6].split(';') ] if row[6]!='.'
else []
info = row[7].split(';')
format = intern(row[8])
- genos = [ g.split(':') for g in row[9:] ] if len(row)>9 else
None
+
+ if ':' in format:
+ genos = [ g.split(':') for g in row[9:] ] if len(row)>9 else
None
+ else:
+ genos = [ [g] for g in row[9:] ] if len(row)>9 else
None

# VCF codes indels with an extra left reference base, which we strip
r = ref[0]
=======================================
--- /glu/modules/seq/annotate.py Tue Feb 21 06:30:56 2012
+++ /glu/modules/seq/annotate.py Wed Mar 14 06:29:49 2012
@@ -130,7 +130,7 @@
v.filter.append('Intergenic')

if not nsevidence:
- v.filter.append('NotPredictedFunctional')
+ v.filter.append('NPF')

if cytoband:
new_info.append('CYTOBAND=%s' % (','.join(cytoband)))
@@ -155,7 +155,8 @@
if cvinfo.exact_vars:
v.names = sorted(set(v.replace('dbsnp:','rs') for v in
cvinfo.exact_vars)|set(v.names))

- new_info.append('COMMON_SCORE=%.2f' % cvinfo.common_score)
+ if cvinfo.exact_vars or cvinfo.common_score>0:
+ new_info.append('COMMON_SCORE=%.2f' % cvinfo.common_score)

if cvinfo.function_info:
function_info = ','.join(cvinfo.function_info)
@@ -186,26 +187,27 @@

new_info.append('KAVIAR_NAMES=%s' % ','.join(ktext))

- ingroup,outgroup = refvars.get(v.chrom,v.start,v.end,v.var) if refvars
else ([],[])
-
- new_info.append('REFVAR_INGROUP_COUNT=%d' % len(ingroup))
- new_info.append('REFVAR_OUTGROUP_COUNT=%d' % len(outgroup))
-
- if ingroup:
- ingroup = ','.join(ingroup)
- new_info.append('REFVAR_INGROUP_NAMES=%s' % ingroup)
-
- if outgroup:
- outgroup = ','.join(outgroup)
- new_info.append('REFVAR_OUTGROUP_NAMES=%s' % outgroup)
- v.filter.append('RefVar')
+ if refvars:
+ ingroup,outgroup = refvars.get(v.chrom,v.start,v.end,v.var) if refvars
else ([],[])
+
+ new_info.append('REFVAR_INGROUP_COUNT=%d' % len(ingroup))
+ new_info.append('REFVAR_OUTGROUP_COUNT=%d' % len(outgroup))
+
+ if ingroup:
+ ingroup = ','.join(ingroup)
+ new_info.append('REFVAR_INGROUP_NAMES=%s' % ingroup)
+
+ if outgroup:
+ outgroup = ','.join(outgroup)
+ new_info.append('REFVAR_OUTGROUP_NAMES=%s' % outgroup)
+ v.filter.append('RefVar')

if 'tgp' in v.names:
v.names.remove('tgp')
v.filter.append('1000G')

- if any(g[0]=='./.' for g in v.genos):
- v.filter.append('PartiallyCalled')
+ #if any(g[0]=='./.' for g in v.genos):
+ # v.filter.append('PartiallyCalled')

if not v.ref or v.var==['']:
v.filter.append('Indel')
@@ -239,7 +241,7 @@

metadata['FILTER'].append('##FILTER=<ID=PartiallyCalled,Description="Variant
is not called for one or more samples">')
metadata['FILTER'].append('##FILTER=<ID=Indel,Description="Variant is an
insertion or deletion">')
metadata['FILTER'].append('##FILTER=<ID=Intergenic,Description="Variant
not in or near a gene">')
-
metadata['FILTER'].append('##FILTER=<ID=NotPredictedFunctional,Description="Variant
is not predicted to alter a protein">')
+ metadata['FILTER'].append('##FILTER=<ID=NPF,Description="Variant is not
predicted to alter a protein">')

if cv:
metadata['FILTER'].append('##FILTER=<ID=Common,Description="Variant is
likely common with common score>%f">' % options.commonscore)

Reply all

Reply to author

Forward

0 new messages