[dspchip] r231 committed - Started cleaning code and commenting it properly....

9 views

Skip to first unread message

dsp...@googlecode.com

unread,

Feb 11, 2011, 6:08:00 AM2/11/11

to dspchi...@googlegroups.com

Revision: 231
Author: daweonline
Date: Fri Feb 11 03:06:18 2011
Log: Started cleaning code and commenting it properly.
Removed savehist option which was pretty useless...

http://code.google.com/p/dspchip/source/detail?r=231

Modified:
/trunk/src/dclib/DSP.py
/trunk/src/dclib/Helpers.py
/trunk/src/dclib/IO.py
/trunk/src/dclib/NumAnalysis.py
/trunk/src/dclib/__init__.py
/trunk/src/dspchip

=======================================
--- /trunk/src/dclib/DSP.py Wed Feb 9 06:54:59 2011
+++ /trunk/src/dclib/DSP.py Fri Feb 11 03:06:18 2011
@@ -1,5 +1,5 @@
-#
-# Copyright (c) 2010, Davide Cittaro
+#
+# Copyright (c) 2010-2011, Davide Cittaro
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
=======================================
--- /trunk/src/dclib/Helpers.py Sun Jan 9 05:17:54 2011
+++ /trunk/src/dclib/Helpers.py Fri Feb 11 03:06:18 2011
@@ -1,5 +1,5 @@
-#
-# Copyright (c) 2010, Davide Cittaro
+#
+# Copyright (c) 2010-2011, Davide Cittaro
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -32,6 +32,7 @@
return '[ ' + ctime() + ' ]'

def Debug(nl= 1,*msg):
+ # This function just writes an informational string to stderr
msg = ' '.join([str(x) for x in msg])
if nl == 1:
msg += '\n'
=======================================
--- /trunk/src/dclib/IO.py Fri Feb 11 02:24:08 2011
+++ /trunk/src/dclib/IO.py Fri Feb 11 03:06:18 2011
@@ -81,7 +81,6 @@

theParser.add_option("--nosig",action="store_true",dest="nosig",help="Don't
perform signal analysis", default=False)

theParser.add_option("--noprofile",action="store_true",dest="noprof",help="Don't
save a profile file", default=False)

theParser.add_option("--save",action="store_true",dest="npysave",help="Save
chromosome data into npy files", default=False)
-
theParser.add_option("--hist",action="store_true",dest="savehist",help="Save
histograms for processed signals", default=False)

theParser.add_option("--stats",action="store_true",dest="savestats",help="Save
basic statistics for processed signals", default=False)

@@ -102,8 +101,7 @@
elif options.profileFormat == 'bw':
options.profile = options.name+".bigwig"
else:
- Debug(1, "Profile output should be either bedgraph or bigwig")
- sys.exit(1)
+ Debug(5, "Profile output should be either bedgraph or bigwig")

options.lastoffsetA = 0 # This will be used by wig, bed and bar readers
options.telldictA = {}
@@ -127,14 +125,11 @@

# check that options that reduce to a single dataset are chosen one at
time
if 'R' in options.pipeline and 'S' in options.pipeline:
- Debug(1, "Subtract and Ratio steps in pipeline are mutually exclusive")
- sys.exit(1)
+ Debug(5, "Subtract and Ratio steps in pipeline are mutually exclusive")
if 'R' in options.pipeline and 'L' in options.pipeline:
- Debug(1, "Log2ratio and Ratio steps in pipeline are mutually
exclusive")
- sys.exit(1)
+ Debug(5, "Log2ratio and Ratio steps in pipeline are mutually
exclusive")
if 'S' in options.pipeline and 'L' in options.pipeline:
- Debug(1, "Subtract and Ratio steps in pipeline are mutually exclusive")
- sys.exit(1)
+ Debug(5, "Subtract and Ratio steps in pipeline are mutually exclusive")

if not options.sigB:
# remove subtract from pipeline if control is not present
@@ -161,8 +156,7 @@

if not options.chromsize and True not in [x == y for x in
[options.formatA, options.formatB] for y in ['sam', 'bw']]:
# If there's no chromSize file and no other format is indexed, raise
the error
- Debug(1, "Please, provide a table with chromosome sizes")
- sys.exit(1)
+ Debug(5, "Please, provide a table with chromosome sizes")

if options.pvaluefilter < 1:
@@ -170,10 +164,6 @@
# peak searching is enabled...
options.findPeaks = True

- if options.savehist == True:
- # if histograms are saved, save also statistics
- options.savestats = True
-
options.peakInterspersion = np.abs(options.peakInterspersion)
if not options.wstep:
options.wstep = options.expWindow / (options.peakInterspersion + 1)
@@ -185,12 +175,10 @@

if options.correlate == True:
if not options.sigB:
- Debug(1, "Please, provide another file")
- sys.exit(1)
+ Debug(5, "Please, provide another file")
removePipelineSteps(options, 'SLR')
# also remove some steps that are pretty much useless here
options.findPeaks = False
- options.savehist = False
options.savestats = False
options.noprof = True
options.zerodata = False
@@ -199,8 +187,7 @@
# some checks on negative values...
for value in [options.pvaluefilter, options.peakRatio, options.wstep,
options.profStep, options.expWindow, options.mapQuality + 1]:
if value <= 0:
- Debug(1, "Mmm... I'm afraid I won't be able to process negative
values such as", value)
- sys.exit(1)
+ Debug(5, "Mmm... I'm afraid I won't be able to process negative
values such as", value)

options.windowingFunction = options.windowingFunction.lower()
if options.windowingFunction not in ['mean', 'med', 'max']:
@@ -253,7 +240,6 @@
Debug(1, "##", "Perform signal analysis:", not options.nosig)
Debug(1, "##", "Write bedGraph file:", not options.noprof)
Debug(1, "##", "Save npy data:", options.npysave)
- Debug(1, "##", "Save histograms for signal:", options.savehist)
Debug(1, "##", "Save basic statistics:", options.savestats)

@@ -335,8 +321,7 @@
try:
pipe = subprocess.Popen([bwInfo, "-chroms", fname], shell=False,
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
except OSError:
- Debug(1, "Please Install bigWigInfo executable to read bigwig files")
- sys.exit(1)
+ Debug(5, "Please Install bigWigInfo executable to read bigwig files")
if pipe.stderr.read() == '-chroms is not a valid option\n':
# an old version of bigWigInfo is installed...
# chrom sizes are output by default
@@ -537,9 +522,8 @@
fh = open(fname, 'rb')
buffer = fh.read(8)
if buffer != 'barr\r\n\x1a\n':
- Debug(1, "Wrong file format")
fh.close()
- sys.exit(1)
+ Debug(5, "Wrong file format")

buffer = fh.read(12)
(version, nseq, ncol) = struct.unpack('>fii', buffer)
@@ -628,8 +612,7 @@
try:
pipe = subprocess.Popen([bw2bdg, "-chrom=" + chrom, fname, "stdout"],
shell=False, stdout=subprocess.PIPE).stdout
except OSError:
- Debug(1, "Please Install bigWigToBedGraph executable to read bigwig
files")
- sys.exit(1)
+ Debug(5, "Please Install bigWigToBedGraph executable to read bigwig
files")

for (theChrom, pos, val) in bx.wiggle.Reader(pipe):
b[pos] = val
@@ -828,31 +811,6 @@
np.save(fname, data)

-def saveHistogram(data, chrom, options):
-# nbins = len(data) / (2 * options.expWindow)
-# if nbins < 100:
-# nbins = 100
- nbins = np.ceil(np.log2(len(data)) + 1)
- fname = options.name + "_" + chrom + "_hist.png"
- fig = plt.figure()
- ax = fig.add_subplot(111)
- ax.set_title('Signal distribution for '+ chrom)
- ax.set_xlabel("Signal value")
- ax.set_ylabel("Probability")
- ax.grid()
- hp, bp, pp = ax.hist(data, bins=nbins, normed=1, alpha=0.3,
color='blue', histtype='stepfilled')
- hc, bc, pc = ax.hist(data, bins=nbins, normed=1, alpha=0.3, color='red',
histtype='stepfilled', cumulative=1)
- ax.hlines(0.05, bc[0], bc[-1], linestyles='dashed')
- ax.hlines(0.95, bc[0], bc[-1], linestyles='dashed')
- fig.savefig(fname, dpi=96)
-
- fname = options.name + "_" + chrom + "_hist.csv"
- fh = open(fname, 'w')
- for b, h in enumerate(hc):
- fh.write(str(h) + ',' + str(bc[b]) + '\n')
- fh.close()
-
-
def saveStats(signal, chrom, options):
fname = options.name + "_" + chrom + "_stats.txt"
stats = summaryStatistics(signal)
=======================================
--- /trunk/src/dclib/NumAnalysis.py Fri Feb 11 02:01:08 2011
+++ /trunk/src/dclib/NumAnalysis.py Fri Feb 11 03:06:18 2011
@@ -1,5 +1,5 @@
-#
-# Copyright (c) 2010, Davide Cittaro
+#
+# Copyright (c) 2010-2011, Davide Cittaro
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
=======================================
--- /trunk/src/dclib/__init__.py Sun Jan 9 06:11:55 2011
+++ /trunk/src/dclib/__init__.py Fri Feb 11 03:06:18 2011
@@ -1,5 +1,5 @@
-#
-# Copyright (c) 2010, Davide Cittaro
+#
+# Copyright (c) 2010-2011, Davide Cittaro
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
=======================================
--- /trunk/src/dspchip Fri Feb 11 00:28:47 2011
+++ /trunk/src/dspchip Fri Feb 11 03:06:18 2011
@@ -1,7 +1,7 @@
#!/usr/bin/env python

#
-# Copyright (c) 2010, Davide Cittaro
+# Copyright (c) 2010-2011, Davide Cittaro
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -33,27 +33,28 @@

def main():

+ # get command line options and print them
options = dclib.IO.parseOptions(dclib.IO.prepareOptions())
-
dclib.IO.debugParameters(options)

- # assume we only want to analyze chromosomes that appear
- # in chip... we should provide a tab file for chromosomes if wig format
- # is called... bah...
+ # get chromosome length. This is useful as we treat chromosome as fixed
size arrays
+ # we need to allocate space...
chromLength = dclib.IO.getChromosomeSizes(options)
chromNames = chromLength.keys()
chromNames.sort()

+ # open handlers for output files, if needed
if not options.noprof and options.profileFormat == 'bdg':
bdgh = dclib.IO.openBedGraph(options)
if options.findPeaks:
peaksh = dclib.IO.openPeaks(options)

+ # Here starts everything: a giant loop over chromosomes!
+ # basically we scan the pipeline options and we repeat that for each
chromosome
for chrom in chromNames:
- theSignalA = np.array([])
- theSignalB = np.array([])
- OneDataSet = False
- sigHist = np.array([])
+ theSignalA = np.array([]) # signal A, typically the ChIP
+ theSignalB = np.array([]) # signal B, typicall the Input or the mock
+ OneDataSet = False # is there more than one dataset?

dclib.Helpers.Debug(1, dclib.Helpers.timestring(), "Processing
chromosome", chrom)
dclib.Helpers.Debug(1, dclib.Helpers.timestring(), "Reading",
options.sigA)
@@ -62,14 +63,18 @@
dclib.Helpers.Debug(1, dclib.Helpers.timestring(), "Reading",
options.sigB)
theSignalB = dclib.IO.inputWrapper(chrom, chromLength[chrom],
options, 0)

+ # From this point on, every step may be saved into a npy object, to
load in ipython
+ # and debug what's wrong...
if options.npysave:
dclib.IO.npysave(theSignalA, options.name, chrom, "SignalA", "RAW")
if options.sigB:
dclib.IO.npysave(theSignalB, options.name, chrom, "SignalB", "RAW")

+ # Begin to loop over pipeline steps. Note that pipeline is a string
and we exploit
+ # python string iterator.
for analysisStep in options.pipeline:
if analysisStep == 'F':
- # FIR filter
+ # Fast Fourier Transform smoothing
theSignalA = dclib.NumAnalysis.PPLFFT(theSignalA, options)
if not OneDataSet and options.sigB:
theSignalB = dclib.NumAnalysis.PPLFFT(theSignalB, options)
@@ -78,7 +83,7 @@
if options.sigB and not OneDataSet:
dclib.IO.npysave(theSignalB, options.name,
chrom, "SignalB", "FFT")
if analysisStep == 'W':
- # FIR filter
+ # Wavelet denoising
theSignalA = dclib.NumAnalysis.PPLWavelet(theSignalA, options)
if not OneDataSet and options.sigB:
theSignalB = dclib.NumAnalysis.PPLWavelet(theSignalB, options)
@@ -87,7 +92,7 @@
if options.sigB and not OneDataSet:
dclib.IO.npysave(theSignalB, options.name,
chrom, "SignalB", "wltd")
if analysisStep == 'A':
- # FIR filter
+ # Autocorrelation
theSignalA = dclib.NumAnalysis.PPLAutocorrelate(theSignalA,
options)
if not OneDataSet and options.sigB:
theSignalB = dclib.NumAnalysis.PPLAutocorrelate(theSignalB,
options)
@@ -105,7 +110,7 @@
if options.sigB and not OneDataSet:
dclib.IO.npysave(theSignalB, options.name,
chrom, "SignalB", "GS")
elif analysisStep == 'E':
- # equalize
+ # Signal equalization
theSignalA = dclib.NumAnalysis.PPLequalize(theSignalA, options)
if not OneDataSet and options.sigB:
theSignalB = dclib.NumAnalysis.PPLequalize(theSignalB, options)
@@ -114,7 +119,7 @@
if options.sigB and not OneDataSet:
dclib.IO.npysave(theSignalB, options.name,
chrom, "SignalB", "EQ")
elif analysisStep == 'N':
- # normalize
+ # Signal normalization
theSignalA = dclib.NumAnalysis.PPLnormalize(theSignalA, options)
if not OneDataSet and options.sigB:
theSignalB = dclib.NumAnalysis.PPLnormalize(theSignalB, options)
@@ -123,32 +128,31 @@
if options.sigB and not OneDataSet:
dclib.IO.npysave(theSignalB, options.name,
chrom, "SignalB", "Norm")
elif analysisStep == 'S':
- # subtract
+ # Subtract signal B from signal A
theSignalA = dclib.NumAnalysis.PPLsubtract(theSignalA, theSignalB,
options)
del(theSignalB)
# delete options.sigB so that this can be a step in the middle of
- # the pipeline
+ # the pipeline. Now we are running with a single dataset
OneDataSet = True
if options.npysave:
dclib.IO.npysave(theSignalA, options.name,
chrom, "SignalA", "diff")
elif analysisStep == 'R':
- # subtract
+ # Calculate data ratio, i.e. (A-B)/A
theSignalA = dclib.NumAnalysis.PPLRatio(theSignalA, theSignalB,
options)
del(theSignalB)
- # delete options.sigB so that this can be a step in the middle of
- # the pipeline
+ # As above, reduce to a single dataset
OneDataSet = True
if options.npysave:
dclib.IO.npysave(theSignalA, options.name,
chrom, "SignalA", "ratio")
elif analysisStep == 'L':
- # subtract
+ # Calculate log2ratio between A and B. This is, by far, the most
common step
theSignalA = dclib.NumAnalysis.PPLL2R(theSignalA, theSignalB,
options)
del(theSignalB)
OneDataSet = True
if options.npysave:
dclib.IO.npysave(theSignalA, options.name,
chrom, "SignalA", "l2r")
elif analysisStep == 'B':
- # baseline correction
+ # Perform baseline correction. This step needs some refinement...
theSignalA = dclib.NumAnalysis.PPLbaseline(theSignalA, options)
if not OneDataSet and options.sigB:
theSignalB = dclib.NumAnalysis.PPLbaseline(theSignalB, options)
@@ -157,7 +161,8 @@
if options.sigB:
dclib.IO.npysave(theSignalB, options.name,
chrom, "SignalB", "bsl")
elif analysisStep == 'Z':
- # baseline correction
+ # Remove negative values. Used when we want a 1-tail p-value
estimation for peaks
+ # and also when we are interested only in signal A properties
theSignalA = dclib.NumAnalysis.PPLZero(theSignalA, options)
if not OneDataSet and options.sigB:
theSignalB = dclib.NumAnalysis.PPLZero(theSignalB, options)
@@ -166,10 +171,9 @@
if options.sigB:
dclib.IO.npysave(theSignalB, options.name,
chrom, "SignalB", "zero")

- # to get p-values for peak heights we calculate histograms
- # for signal intensities.
-
- if options.correlate:
+ if options.correlate and not OneDataSet:
+ # Here we calculate correlation between signal A and signal B. This
will
+ # plot a chart and linear regression data.
corr_results = dclib.NumAnalysis.PPLCorrelate(theSignalA,
theSignalB, options)
ccoef = corr_results[2][0]
cpvalue = corr_results[2][1]
@@ -178,15 +182,12 @@
dclib.IO.npysave(corr_results[1], options.name, chrom, "xc2")
dclib.IO.saveCorrelation(corr_results, chrom, options)

- if options.savehist:
- dclib.Helpers.Debug(1, dclib.Helpers.timestring(), "Saving signal
histogram into file")
- dclib.IO.saveHistogram(theSignalA, chrom, options)
-
if options.savestats:
+ # save basic statistics. This is quite useless :-)
dclib.IO.saveStats(theSignalA, chrom, options)
-

if not options.noprof:
+ # Save Signal into a the profile... Actually only signal A is saved.
@@FIXTHIS@@
if options.profileFormat == 'bdg':
dclib.Helpers.Debug(1, dclib.Helpers.timestring(), "Writing
profile data in bedgraph")
dclib.IO.writeBedGraph(bdgh, theSignalA, options, chrom=chrom)
@@ -195,12 +196,18 @@
dclib.IO.writeBigWig(theSignalA, options, chromLength, chrom=chrom)

if options.findPeaks:
+ # Save peaks into the .peaks file. Only peaks in signal A will be
retained. @@FIXTHIS@@
dclib.Helpers.Debug(1, dclib.Helpers.timestring(), "Finding peaks
boundaries")
peaks = dclib.NumAnalysis.findPeaks(theSignalA, options)
dclib.Helpers.Debug(1, dclib.Helpers.timestring(), "Writing peak
data in peaks file")
dclib.IO.writePeaks(peaksh, theSignalA, peaks, options, chrom=chrom)

+ # Explicitly free some memory...
del(theSignalA)
+ if not OneDataSet:
+ del(theSignalB)
+
+ # At the end of the loop it's time to close file handlers...
if not options.noprof and options.profileFormat == 'bdg':
dclib.IO.closeBedGraph(bdgh)
if options.findPeaks:

Reply all

Reply to author

Forward

0 new messages