3 new revisions:
Revision: 91967327cd62
Author: Kevin B. Jacobs <
jac...@bioinformed.com>
Date: Mon Aug 13 11:23:11 2012
Log: Allow processing of GFR files with variable calling conventions
(AB,...
http://code.google.com/p/glu-genetics/source/detail?r=91967327cd62
Revision: d439afc7222e
Author: Kevin B. Jacobs <
jac...@bioinformed.com>
Date: Mon Aug 13 11:25:40 2012
Log: Add better normalization metrics and optional plotting code
http://code.google.com/p/glu-genetics/source/detail?r=d439afc7222e
Revision: c3a69cf26ebd
Author: Kevin B. Jacobs <
jac...@bioinformed.com>
Date: Mon Aug 13 11:26:42 2012
Log: Add region masking code and fix missing data rate computation in
the...
http://code.google.com/p/glu-genetics/source/detail?r=c3a69cf26ebd
==============================================================================
Revision: 91967327cd62
Author: Kevin B. Jacobs <
jac...@bioinformed.com>
Date: Mon Aug 13 11:23:11 2012
Log: Allow processing of GFR files with variable calling conventions
(AB,
forward, reverse, top, and bottom).
http://code.google.com/p/glu-genetics/source/detail?r=91967327cd62
Modified:
/glu/lib/illumina.py
/glu/modules/convert/from_gfr.py
=======================================
--- /glu/lib/illumina.py Wed Jul 25 05:12:51 2012
+++ /glu/lib/illumina.py Mon Aug 13 11:23:11 2012
@@ -388,12 +388,13 @@
indelmap = {'D':'-','I':'+','A':'A','C':'C','G':'G','T':'T'}
strandmap = {STRAND_UNKNOWN:STRAND_UNKNOWN,'+':'-','-':'+'}
NA = ('N','A')
+ AB = ('A','B')
cnv = ('-','+')
targetstrand = targetstrand.lower()
validstrand = set(('top','bot','p','plus','m','minus'))
plusminus = set(('p','plus','m','minus'))
- assert targetstrand in ('top','bottom','forward','reverse', \
+ assert targetstrand in ('ab','top','bottom','forward','reverse', \
'real_forward','real_reverse',
'customer','anticustomer','design','antidesign')
@@ -454,6 +455,10 @@
if loc:
loc = int(loc)
+ if targetstrand=='ab':
+ yield lname,chrom,loc,STRAND_UNKNOWN,AB
+ continue
+
# CNV probes can simply be skipped
if (a,b) == NA:
yield lname,chrom,loc,STRAND_UNKNOWN,cnv
=======================================
--- /glu/modules/convert/from_gfr.py Sat Jul 30 20:35:40 2011
+++ /glu/modules/convert/from_gfr.py Mon Aug 13 11:23:11 2012
@@ -105,13 +105,31 @@
return header,chain(rows,indata),num_snps,None,None
-def read_gfr(filename):
+def read_gfr(filename,targetstrand):
+
infile = autofile(filename)
indata = csv.reader(infile,dialect='excel-tab')
header,indata,num_snps,num_samples,manifest = gfr_header(indata)
- fields = ['Sample ID','SNP Name','GC Score','Allele1 -
Forward','Allele2 - Forward',
- 'X','Y','X Raw','Y Raw','B Allele Freq','Log R Ratio']
+ strand = targetstrand.lower()
+
+ fields = ['Sample ID','SNP Name','GC Score']
+
+ if strand=='ab':
+ fields += ['Allele1 - AB','Allele2 - AB']
+ elif strand=='forward':
+ fields += ['Allele1 - Forward','Allele2 - Forward']
+ elif strand=='reverse':
+ fields += ['Allele1 - Reverse','Allele2 - Reverse']
+ elif strand=='top':
+ fields += ['Allele1 - Top','Allele2 - Top']
+ elif strand=='bottom':
+ fields += ['Allele1 - Bottom','Allele2 - Bottom']
+ else:
+ raise ValueError('Unsupported strand convention: %s' % targetstrand)
+
+ fields += ['X','Y','X Raw','Y Raw','B Allele Freq','Log R Ratio']
+
indices = [ header.index(f) for f in fields ]
def data():
@@ -237,7 +255,7 @@
return a
-def gdat_writer(gdat, gfr_data, genome, abmap, transform):
+def gdat_writer(gdat, gfr_data, genome, transform, abmap=None):
sample_chunk = 16
gdat_SNPs = gdat['SNPs']
@@ -328,7 +346,7 @@
snp_recs = []
for i,name in enumerate(snp_names):
- ab = abmap[name]
+ ab = abmap[name] if abmap is not None else 'AB'
loc = locusmap[name]
snp_recs.append( (name,loc.chromosome,loc.location,''.join(ab)) )
@@ -343,7 +361,13 @@
id_chunk.append(sample_id)
- geno_chunk[j] = [ gmap[g] for gmap,g in izip(genomap,geno) ]
+ try:
+ geno_chunk[j] = [ gmap[g] for gmap,g in izip(genomap,geno) ]
+ except KeyError:
+ for name,gmap,g in izip(snp_names,genomap,geno):
+ if g not in gmap:
+ print 'Invalid allele mapping for locus %s, genotype=%s,
mapping=%s' % (name,g,gmap)
+
gc_chunk[j] = gc
x_chunk[j] = x
y_chunk[j] = y
@@ -370,6 +394,8 @@
parser.add_argument('manifest', help='Illumina BPM manifest file')
parser.add_argument('gfrfile', help='Illumina Genotype Final Report
(GFR) file')
+ parser.add_argument('--calls', metavar='CONVENTION',
choices=['AB','forward','reverse','top','bottom'],
+ help='Use specified calling convention for alleles
(default=AB, forward, reverse, top, bottom)')
parser.add_argument('--includesamples', metavar='FILE', action='append',
help='List of samples to include, all others will be
skipped')
parser.add_argument('--excludesamples', metavar='FILE', action='append',
@@ -395,17 +421,19 @@
sys.stderr.write('Loading Illumina manifest file...')
genome = Genome()
- abmap =
create_Illumina_abmap(options.manifest,genome,targetstrand='forward',
+
+ abmap =
create_Illumina_abmap(options.manifest,genome,targetstrand=options.calls,
errorhandler=errorhandler)
+
sys.stderr.write('done.\n')
transform = GenoTransform.from_object(options)
- num_snps,num_samples,manifest,gfr_data = read_gfr(options.gfrfile)
+ num_snps,num_samples,manifest,gfr_data =
read_gfr(options.gfrfile,options.calls)
gdat = create_gdat(options.output, num_snps, num_samples)
#gfr_data = progress_bar(gfr_data,num_samples)
- gdat_writer(gdat, gfr_data, genome, abmap, transform)
+ gdat_writer(gdat, gfr_data, genome, transform, abmap)
attrs = gdat.attrs
attrs['GLU_FORMAT'] = 'gdat'
==============================================================================
Revision: d439afc7222e
Author: Kevin B. Jacobs <
jac...@bioinformed.com>
Date: Mon Aug 13 11:25:40 2012
Log: Add better normalization metrics and optional plotting code
http://code.google.com/p/glu-genetics/source/detail?r=d439afc7222e
Modified:
/glu/modules/cnv/normalize.py
=======================================
--- /glu/modules/cnv/normalize.py Wed Jul 25 05:07:46 2012
+++ /glu/modules/cnv/normalize.py Mon Aug 13 11:25:40 2012
@@ -240,6 +240,31 @@
t_g /= n_g
+plot_num = 0
+
+def plot(x,y):
+ import
matplotlib.cm as cm
+ import matplotlib.pyplot as plt
+
+ xmin = x.min()
+ xmax = x.max()
+ ymin = y.min()
+ ymax = y.max()
+
+ plt.clf()
+
+ plt.hexbin(x,y, cmap=cm.jet)
+ plt.axis([0, 2.5, -0.25,0.25])
+
+ cb = plt.colorbar()
+ cb.set_label('counts')
+
+ plt.show()
+ global plot_num
+ plot_num += 1
+ plt.savefig('plots/out%04d.png' % plot_num)
+
+
def regress_intensity(x, y, design, dmask, genos, r_AA, r_AB, r_BB,
rmodel='linear', thin=None, minpoints=10000):
AA = genos=='AA'
AB = genos=='AB'
@@ -278,6 +303,9 @@
r_geno = r_geno[valid].reshape(-1,1)
design_valid = design[valid]
+ if 0:
+ plot(r_geno.reshape(-1), design_valid[:,9].reshape(-1))
+
if thin:
if n/thin<minpoints:
thin = n//minpoints
@@ -466,6 +494,8 @@
X = gdat['X']
Y = gdat['Y']
+ LRR = gdat['LRR']
+ BAF = gdat['BAF']
samples = gdat['Samples']
genotypes = gdat['Genotype']
@@ -475,12 +505,12 @@
BAF_QN = BatchTableWriter(gdat['BAF_QN'])
try:
- pass2 = enumerate(parallel_gdat_iter(samples,X,Y,genotypes))
+ pass2 =
enumerate(parallel_gdat_iter(samples,X,Y,genotypes,LRR,BAF))
if options.progress:
pass2 = progress_loop(pass2, length=n, units='samples',
label='PASS 2: ')
- for i,(sample,x,y,genos) in pass2:
+ for i,(sample,x,y,genos,lrr_orig,baf_orig) in pass2:
if not options.progress:
print ' Sample %5d / %d: %s' % (i+1,n,sample)
@@ -493,8 +523,17 @@
lrr,baf = compute_lrr_baf(t,r,r_AA,r_AB,r_BB,t_AA,t_AB,t_BB)
- mask = np.isfinite(lrr)&np.isfinite(baf)
- print ' ... sigmaLRR=%.2f, sigmaBAFhet=%.3f' %
(lrr[mask].std(),baf[mask&(genos=='AB')].std())
+ mask = dmask&np.isfinite(lrr)&np.isfinite(baf)
+ n_lrr_std = lrr[mask].std()
+ n_baf_std = baf[mask&(genos=='AB')].std()
+ #print ' ... Norm: sigmaLRR=%.2f, sigmaBAFhet=%.3f' %
(n_lrr_std,n_baf_std)
+
+ mask = dmask&np.isfinite(lrr_orig)&np.isfinite(baf_orig)
+ o_lrr_std = lrr_orig[mask].std()
+ o_baf_std = baf_orig[mask&(genos=='AB')].std()
+ #print ' ... Orig: sigmaLRR=%.2f, sigmaBAFhet=%.3f' %
(o_lrr_std,o_baf_std)
+
+ print ' ... LRR improvement = %5.2f, BAF improvement = %5.2f' %
(o_lrr_std/n_lrr_std,o_baf_std/n_baf_std)
LRR_QN.write(lrr)
BAF_QN.write(baf)
==============================================================================
Revision: c3a69cf26ebd
Author: Kevin B. Jacobs <
jac...@bioinformed.com>
Date: Mon Aug 13 11:26:42 2012
Log: Add region masking code and fix missing data rate computation in
the
presence of masking.
http://code.google.com/p/glu-genetics/source/detail?r=c3a69cf26ebd
Modified:
/glu/modules/cnv/plot_events.py
/glu/modules/cnv/stats.py
=======================================
--- /glu/modules/cnv/plot_events.py Fri Jul 20 09:54:30 2012
+++ /glu/modules/cnv/plot_events.py Mon Aug 13 11:26:42 2012
@@ -201,6 +201,8 @@
def norm_chromosome(chrom):
+ chrom = chrom.strip()
+
if chrom.startswith('chr'):
chrom = chrom[3:]
if chrom.upper()=='MT':
@@ -448,6 +450,25 @@
return t,df,p
+def parse_chrom_region(s):
+ chrom,rest = s.split(':',1)
+ start,stop = rest.split('-')
+ chrom = norm_chromosome(chrom)
+ start = int(start.replace(',',''))-1
+ stop = int(stop.replace(',',''))
+ return chrom,start,stop
+
+
+def make_mask_dict(masks):
+ chrmask = {}
+ if not masks:
+ return chrmask
+ for mask in masks:
+ chrom,start,stop = parse_chrom_region(mask)
+ chrmask[chrom] = (start,stop)
+ return chrmask
+
+
def option_parser():
from glu.lib.glu_argparse import GLUArgumentParser
@@ -476,6 +497,8 @@
help='List of samples to include,
all others will be skipped')
parser.add_argument('--excludesamples', metavar='FILE', action='append',
help='List of samples to exclude,
only samples not present will be kept')
+ parser.add_argument('--chrmask', action='append',
+ help='Show only chrom:start-stop
region for events on chrom. May be specified multiple times.')
table_options(parser)
@@ -520,6 +543,8 @@
else:
include = exclude = None
+ chrmask = make_mask_dict(options.chrmask)
+
for assay,assay_events in
groupby(events,key=attrgetter(options.assayid)):
if include is not None and assay not in include:
continue
@@ -625,6 +650,15 @@
plotname = '%s/%s' % (options.outdir,plotname)
title = options.title.format(**variables)
+ if chrom in chrmask:
+ start,stop = chrmask[chrom]
+ chrom_mask = pos>=start
+ chrom_mask &= pos< stop
+ pos = pos[chrom_mask]
+ chrom_genos = chrom_genos[chrom_mask]
+ chrom_baf = chrom_baf[chrom_mask]
+ chrom_lrr = chrom_lrr[chrom_mask]
+
plot_chromosome(plotname, pos, chrom_lrr, chrom_baf,
genos=chrom_genos,
title=title, events=chrom_events)
=======================================
--- /glu/modules/cnv/stats.py Fri Jul 20 09:54:30 2012
+++ /glu/modules/cnv/stats.py Mon Aug 13 11:26:42 2012
@@ -3,8 +3,8 @@
from __future__ import division
__gluindex__ = True
-__abstract__ = 'Perform quantile normalization and re-estimate LRR and
BAF'
-__copyright__ = 'Copyright (c) 2011, BioInformed LLC and the U.S.
Department of Health & Human Services. Funded by NCI under Contract
N01-CO-12400.'
+__abstract__ = 'Generate statistics on LRR and BAF for original and
renormalized values'
+__copyright__ = 'Copyright (c) 2012, BioInformed LLC and the U.S.
Department of Health & Human Services. Funded by NCI under Contract
N01-CO-12400.'
__license__ = 'See GLU license for terms by running: glu license'
__revision__ = '$Id$'
@@ -29,6 +29,39 @@
return stats.skewtest(s)[0]
+def normalize_chromosome(chrom):
+ chrom = chrom.strip().upper()
+ if not chrom:
+ return None
+ if chrom.startswith('CHR'):
+ chrom = chrom[3:]
+ if chrom=='MT':
+ chrom = 'M'
+ return chrom
+
+
+def parse_chromosome_region(region):
+ if ':' not in region:
+ chrom = normalize_chromosome(region)
+ start = 0
+ stop = sys.maxint
+ else:
+ chrom,rest = region.split(':',1)
+ start,stop = rest.split('-')
+ chrom = normalize_chromosome(chrom)
+ start = int(start.replace(',',''))-1
+ stop = int(stop.replace(',',''))
+
+ return chrom,start,stop
+
+
+def parse_chromosome_regions(regions):
+ for region in regions:
+ region = region.strip()
+ if region:
+ yield parse_chromosome_region(region)
+
+
def option_parser():
from glu.lib.glu_argparse import GLUArgumentParser
@@ -38,6 +71,8 @@
parser.add_argument('-o', '--output', metavar='FILE', default='-',
help='Output results (default is "-" for standard
out)')
+ parser.add_argument('--chrmask', action='append',
+ help='Show only chrom:start-stop
region for events on chrom. May be specified multiple times.')
parser.add_argument('--chromosomes',metavar='CHRS', default='',
help='Comma separated list of
chromosomes to plot (blank for all)')
parser.add_argument('-P', '--progress', action='store_true',
@@ -75,24 +110,25 @@
np.set_printoptions(linewidth=120,precision=2,suppress=True)
np.seterr(all='ignore')
- mask = None
+ valid_snps,mask = s,None
- if options.chromosomes:
+ if options.chromosomes or options.chrmask:
chrom_index = gdat.chromosome_index
mask = np.zeros(gdat.snp_count, dtype=bool)
- for chrom in options.chromosomes.split(','):
- chrom = chrom.strip().upper()
- if not chrom:
- continue
- if chrom.startswith('CHR'):
- chrom = chrom[3:]
- if chrom=='MT':
- chrom = 'M'
+ regions = (options.chrmask or []) + (options.chromosomes
or '').split(',')
+ regions = parse_chromosome_regions(regions)
- indices = chrom_index[chrom][1]
- mask[indices] = True
+ for chrom,start,stop in regions:
+ print '!!!',chrom,start,stop
+ pos,indices = chrom_index[chrom]
+ region_mask = (pos>=start)&(pos<stop)
+ region_indices = indices[region_mask]
+ mask[region_indices] = True
+
+ valid_snps = mask.sum()
+ print 'Found %d valid probes' % valid_snps
data = parallel_gdat_iter(LRR_orig,LRR_norm,BAF_orig,BAF_norm,genotypes)
@@ -138,8 +174,8 @@
baf_orig_ab_sd = baf_orig[valid&AB].std(ddof=1)
baf_orig_bb_u = baf_orig[valid&BB].mean()
baf_orig_bb_sd = baf_orig[valid&BB].std(ddof=1)
- missing_orig = 1-valid.sum()/s
- #missing_orig = (valid&NN).sum()/s
+ missing_orig = 1-valid.sum()/valid_snps
+ #missing_orig = (valid&NN).sum()/valid_snps
valid = np.isfinite(baf_norm)&np.isfinite(lrr_norm)
count_norm = valid.sum()
@@ -152,8 +188,8 @@
baf_norm_ab_sd = baf_norm[valid&AB].std(ddof=1)
baf_norm_bb_u = baf_norm[valid&BB].mean()
baf_norm_bb_sd = baf_norm[valid&BB].std(ddof=1)
- missing_norm = 1-valid.sum()/s
- #missing_norm = (valid&NN).sum()/s
+ missing_norm = 1-valid.sum()/valid_snps
+ #missing_norm = (valid&NN).sum()/valid_snps
out.writerow([assay,name,
count_orig,lrr_orig_u,lrr_orig_sd,baf_orig_skew,