[glu-genetics] 2 new revisions pushed by bioinformed@gmail.com on 2012-03-13 00:56 GMT

2 views
Skip to first unread message

glu-ge...@googlecode.com

unread,
Mar 12, 2012, 8:57:24 PM3/12/12
to glu...@googlegroups.com
2 new revisions:

Revision: 1b42273f0fa9
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Mon Mar 12 17:55:54 2012
Log: Add new VCF summary and converstion modules
http://code.google.com/p/glu-genetics/source/detail?r=1b42273f0fa9

Revision: b2a5b928229b
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Mon Mar 12 17:56:32 2012
Log: Remove unused import
http://code.google.com/p/glu-genetics/source/detail?r=b2a5b928229b

==============================================================================
Revision: 1b42273f0fa9
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Mon Mar 12 17:55:54 2012
Log: Add new VCF summary and converstion modules
http://code.google.com/p/glu-genetics/source/detail?r=1b42273f0fa9

Added:
/glu/modules/seq/vcf2ldat.py
/glu/modules/seq/vcf_summary.py

=======================================
--- /dev/null
+++ /glu/modules/seq/vcf2ldat.py Mon Mar 12 17:55:54 2012
@@ -0,0 +1,100 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import division
+
+__gluindex__ = True
+__abstract__ = 'Convert a VCF file to an ldat file'
+__copyright__ = 'Copyright (c) 2010, BioInformed LLC and the U.S.
Department of Health & Human Services. Funded by NCI under Contract
N01-CO-12400.'
+__license__ = 'See GLU license for terms by running: glu license'
+__revision__ = '$Id$'
+
+import sys
+
+import numpy as np
+
+from itertools import izip
+
+from glu.lib.fileutils import table_writer
+from glu.lib.seqlib.vcf import VCFReader
+
+
+def option_parser():
+ from glu.lib.glu_argparse import GLUArgumentParser
+
+ parser = GLUArgumentParser(description=__abstract__)
+
+ parser.add_argument('variants', help='Input VCF variant file')
+
+ parser.add_argument('-o', '--output', metavar='FILE', default='-',
+ help='Output variant file')
+ return parser
+
+
+def main():
+ parser = option_parser()
+ options = parser.parse_args()
+
+ vcf = VCFReader(options.variants,hyphen=sys.stdin)
+ out = table_writer(options.output,hyphen=sys.stdout)
+
+ samples = vcf.samples
+ n = len(samples)
+
+ genomaps = {}
+
+ out.writerow(['ldat']+samples)
+
+ seen = set()
+ non_autosomes = set(['chrX','chrY','chrM'])
+
+ for v in vcf:
+ if v.chrom in non_autosomes or not v.names or len(v.var)!=1:
+ continue
+
+ a,b = ab = v.ref,v.var[0]
+
+ # Count SNPs only for now
+ if len(a)!=1 or len(b)!=1:
+ continue
+
+ name = v.names[0]
+
+ if not name.startswith('rs'):
+ continue
+
+ if name in seen:
+ continue
+
+ seen.add(name)
+
+ genomap = genomaps.get(ab)
+
+ if genomap is None:
+ genomap = genomaps[ab] = {'0/0':a+a,'0/1':a+b,'1/0':a+b,'1/1':b+b}
+
+ genos = [ genomap.get(g[0],' ') for g in v.genos ]
+
+ if genos.count(' ')>n/2:
+ continue
+
+ out.writerow( [name] + genos )
+
+
+if __name__=='__main__':
+ if 1:
+ main()
+ else:
+ try:
+ import cProfile as profile
+ except ImportError:
+ import profile
+ import pstats
+
+ prof = profile.Profile()
+ try:
+ prof.runcall(main)
+ finally:
+ stats = pstats.Stats(prof,stream=sys.stderr)
+ stats.strip_dirs()
+ stats.sort_stats('time', 'calls')
+ stats.print_stats(25)
=======================================
--- /dev/null
+++ /glu/modules/seq/vcf_summary.py Mon Mar 12 17:55:54 2012
@@ -0,0 +1,144 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import division
+
+__gluindex__ = True
+__abstract__ = 'Produce summary statistics from an annotated VCF file'
+__copyright__ = 'Copyright (c) 2010, BioInformed LLC and the U.S.
Department of Health & Human Services. Funded by NCI under Contract
N01-CO-12400.'
+__license__ = 'See GLU license for terms by running: glu license'
+__revision__ = '$Id$'
+
+import sys
+
+import numpy as np
+
+from itertools import izip
+
+from glu.lib.fileutils import table_writer
+from glu.lib.seqlib.vcf import VCFReader
+
+
+Ti_alleles = set( [('A','G'), ('G','A'), ('C','T'), ('T','C')] )
+
+
+def option_parser():
+ from glu.lib.glu_argparse import GLUArgumentParser
+
+ parser = GLUArgumentParser(description=__abstract__)
+
+ parser.add_argument('variants', help='Input VCF variant file')
+
+ parser.add_argument('-o', '--output', metavar='FILE', default='-',
+ help='Output variant file')
+ return parser
+
+
+def main():
+ parser = option_parser()
+ options = parser.parse_args()
+
+ vcf = VCFReader(options.variants,hyphen=sys.stdin)
+ out = table_writer(options.output,hyphen=sys.stdout)
+
+ samples = vcf.samples
+ n = len(samples)
+
+ ex_ti_tv = np.zeros( (2,2,n), dtype=int )
+ ge_ti_tv = np.zeros( (2,2,n), dtype=int )
+
+ ex_novel = np.zeros(n, dtype=int)
+ ge_novel = np.zeros(n, dtype=int)
+
+ ex_known = np.zeros(n, dtype=int)
+ ge_known = np.zeros(n, dtype=int)
+
+ variant_genos = set(['1/0','0/1','1/1'])
+
+ for v in vcf:
+ #if float(v.qual)<30:
+ # continue
+
+ # Count SNPs only for now
+ if len(v.ref)!=1 or len(v.var)!=1 or len(v.var[0])!=1:
+ continue
+
+ #missing = np.fromiter( ('.' in g for g in v.genos),
dtype=bool, count=n )
+ #if missing.sum()>n/2:
+ # continue
+
+ variant = np.fromiter( (g[0] in variant_genos for g in v.genos),
dtype=bool, count=n )
+
+ if not variant.sum():
+ continue
+
+ infomap = {}
+ for inf in v.info:
+ if '=' in inf:
+ key,value = inf.split('=',1)
+ else:
+ key,value = inf,''
+
+ infomap[key] = value
+
+ if not v.names and int(infomap.get('REFVAR_OUTGROUP_COUNT',0))>50:
+ continue
+
+ if v.names:
+ ge_known += variant
+ else:
+ ge_novel += variant
+
+ alleles = v.ref,v.var[0]
+
+ ge_ti_tv[bool(v.names),bool(alleles in Ti_alleles)] += variant
+
+ if 'CDS' in infomap.get('GENE_LOCATION',[]):
+ if v.names:
+ ex_known += variant
+ else:
+ ex_novel += variant
+
+ ex_ti_tv[bool(v.names)][bool(alleles in Ti_alleles)] += variant
+
+ ge_ti = ge_ti_tv[0][1] + ge_ti_tv[1][1]
+ ge_tv = ge_ti_tv[0][0] + ge_ti_tv[1][0]
+ ex_ti = ex_ti_tv[0][1] + ex_ti_tv[1][1]
+ ex_tv = ex_ti_tv[0][0] + ex_ti_tv[1][0]
+
+
out.writerow(['SAMPLE','GENOME_KNOWN_VARIANTS','GENOME_NOVEL_VARIANTS','GENOME_NOVEL_FRACTION',
+ 'GENOME_TiTv_ALL', 'GENOME_TiTv_KNOWN', 'GENOME_TiTv_NOVEL',
+ 'EXOME_KNOWN_VARIANTS','EXOME_NOVEL_VARIANTS','EXOME_NOVEL_FRACTION',
+ 'EXOME_TiTv_ALL', 'EXOME_TiTv_KNOWN', 'EXOME_TiTv_NOVEL'])
+
+ rows = izip(samples,ge_known,ge_novel,ge_novel/(ge_novel+ge_known),
+ ge_ti/ge_tv,
+ ge_ti_tv[1][1]/ge_ti_tv[1][0],
+ ge_ti_tv[0][1]/ge_ti_tv[0][0],
+ ex_known,
+ ex_novel,
+ ex_novel/(ex_novel+ex_known),
+ ex_ti/ex_tv,
+ ex_ti_tv[1][1]/ex_ti_tv[1][0],
+ ex_ti_tv[0][1]/ex_ti_tv[0][0])
+
+ out.writerows(rows)
+
+
+if __name__=='__main__':
+ if 1:
+ main()
+ else:
+ try:
+ import cProfile as profile
+ except ImportError:
+ import profile
+ import pstats
+
+ prof = profile.Profile()
+ try:
+ prof.runcall(main)
+ finally:
+ stats = pstats.Stats(prof,stream=sys.stderr)
+ stats.strip_dirs()
+ stats.sort_stats('time', 'calls')
+ stats.print_stats(25)

==============================================================================
Revision: b2a5b928229b
Author: Kevin B. Jacobs <jac...@bioinformed.com>
Date: Mon Mar 12 17:56:32 2012
Log: Remove unused import
http://code.google.com/p/glu-genetics/source/detail?r=b2a5b928229b

Modified:
/glu/modules/seq/filtervcf.py

=======================================
--- /glu/modules/seq/filtervcf.py Sun Nov 6 13:48:32 2011
+++ /glu/modules/seq/filtervcf.py Mon Mar 12 17:56:32 2012
@@ -14,8 +14,6 @@

from collections import defaultdict

-from pysam import Fastafile
-
from glu.lib.fileutils import table_writer, table_reader,
autofile, hyphen, map_reader
from glu.lib.progressbar import progress_loop

Reply all
Reply to author
Forward
0 new messages