[dspchip] r232 committed - More cleaning and comments....

dsp...@googlecode.com

Feb 11, 2011, 8:22:06 AM
to dspchi...@googlegroups.com
Revision: 232
Author: daweonline
Date: Fri Feb 11 05:21:08 2011
Log: More cleaning and comments.
Small modification in peak finding: it used to ignore negative values if a
control was not given; now it evaluates them unless the 'Z' step is specified.
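
As a quick illustration of the behavioural change described above, here is a
rough standalone sketch (an illustration only, not dspchip's actual code: the
split_and_find helper and the find_peaks callable are hypothetical stand-ins
for findDataPeaks, and the zerodata flag mirrors options.zerodata from the
diff below):

import numpy as np

def split_and_find(data, zerodata, find_peaks):
    # Before r232 the negative part of the signal was searched only when a
    # control was given; after r232 it is always searched, unless the 'Z'
    # (zeroing) step has already removed negative values (zerodata=True).
    if zerodata:
        return find_peaks(data)
    pos = np.where(data > 0, data, 0.0)
    neg = np.where(data < 0, -data, 0.0)
    peaks = find_peaks(pos)
    # negative-side peaks are reported with negated area and height
    peaks += [[s, e, p, -a, -h] for (s, e, p, a, h) in find_peaks(neg)]
    peaks.sort()
    return peaks

Here each peak is a [start, end, p-value, area, height] list, as in
findDataPeaks below.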

http://code.google.com/p/dspchip/source/detail?r=232

Modified:
/trunk/src/dclib/NumAnalysis.py

=======================================
--- /trunk/src/dclib/NumAnalysis.py Fri Feb 11 03:06:18 2011
+++ /trunk/src/dclib/NumAnalysis.py Fri Feb 11 05:21:08 2011
@@ -33,118 +33,112 @@
import DSP

def PPLZero(data, options):
+ # Dumbest function ever... remove negative values
Debug(1, timestring(), "About to remove all negative values")
data[data < 0] = 0
return data

+
def PPLAutocorrelate(data, options):
Debug(1, timestring(), "About to perform autocorrelation")
data = DSP.autocorrelate(data, options.expWindow, options.wstep)
return data

+
def PPLFFT(data, options):
Debug(1, timestring(), "About to perform FFT smooth")
data = DSP.fftsmooth(data, options.expWindow)
return data

+
def PPLWavelet(data, options):
Debug(1, timestring(), "About to perform wavelet denoising")
data = DSP.wltdenoise(data, options.wavelet, options.expWindow)
return data

+
def PPLGaussSmooth(data, options):
Debug(1, timestring(), "Applying Gaussian filter to smooth data")
data = DSP.gsmooth(data, options.expWindow)
return data

+
def PPLequalize(data, options):
Debug(1, timestring(), "Equalizing...")
return DSP.equalize(data, options.expWindow)

+
def PPLnormalize(data, options):
Debug(1, timestring(), "Normalizing...")
return DSP.normalize(data)

+
def PPLsubtract(data1, data2, options):
Debug(1, timestring(), "Subtracting signal B from signal A")
return data1 - data2

+
def PPLRatio(data1, data2, options):
Debug(1, timestring(), "Calculating data ratio")
+ # To be sure we don't have any zero division we set zero values to a relatively
+ # small value. This should not influence anything, as we are calculating the
+ # difference over a single value...
epsilon = 1e-30
- data1 += epsilon
- data2 += epsilon
+ data1[data1 == 0] = epsilon
+ data2[data2 == 0] = epsilon
return (data1 - data2) / np.abs(data2)

+
def PPLL2R(data1, data2, options):
- # at this moment we calculate this value only when we have data for numerator.
- # we assume all values are positive and don't make any sanity check <--- baaad!
Debug(1, timestring(), "Calculating data log2ratio")
-
-# #get the zeros for denominator, we will skip these, actually we will set them to zero
-# zmask = (data1 != 0) & (data2 != 0)
-# #create a zeroed output array
-# dataout = np.zeros(len(data1))
-#
-# #calculate the ratio for masked values. We
-# dataout[zmask] = np.log2(data1[zmask] / data2[zmask])
-
- # I was thinking: skipping values may not be a good idea, especially because
- # we perform averages and statistics on those and the zero, somehow, will count
- # I guess I'll go for the pseudocount right now...
-
+ # There's huge room to improve this. Better ideas are welcome!
+ # Right now we don't make any check on negative values. Log2Ratio has been implemented here
+ # for strictly positive data, in both signal A and signal B. We just add 1 to both
+ # to avoid division by 0 or, worse, logarithm of 0
return np.log2((data1 + 1) / (data2 + 1))

+
def PPLadd(data1, data2, options):
Debug(1, timestring(), "Adding signal B to signal A")
return data1 + data2

+
def PPLbaseline(data, options):
if options.baseline:
# No baseline by default....
Debug(1, timestring(), "Applying baseline correction",
options.baseline, "to signal")
return DSP.baseline(data, options.baseline, options.expWindow)

+
def PPLCorrelate(theSignalA, theSignalB, options):
# first we build two vectors made by integrals for windows
# not sliding... then we calculate the correlation coefficient
# and ideally we plot the chart possibly with a correlation line :-D
Debug(1, timestring(), "Correlating signals")
-
return DSP.correlateSignals(theSignalA, theSignalB, options.expWindow)


-def peakModule(data):
- if not len(data):
- return -5
- if np.max(data) >= 0 and np.min(data) >= 0:
- return 1
- if np.max(data) < 0 and np.min(data) < 0:
- return -1
- return 0
-
-
def growPeak(data, start, end, newStart, newEnd, interMaxDist, options):
- # this function joins peaks if they aren't divided
- # by a deep valley
+ # this function joins peaks consistently with data shape and user options....
# return True if the peaks should be joined or not...
- # also, if peaks are too close they are joined
-
- if newEnd == end:
+
+ if newEnd <= end or newStart < end:
+ # This is trivial... the peaks overlap and the second is included in the first...
return True

- peak1Data = data[start:end]
- peak2Data = data[newStart:newEnd]
- btwnData = data[end:newStart]
- joinData = data[start:newEnd]
-
- jMax = np.max(joinData)
- p1Max = np.max(peak1Data)
- p2Max = np.max(peak2Data)
- bMin = SS.scoreatpercentile(btwnData, 5) #don't use the minimum to avoid extreme division, also, the mean is not the best way <---- baaad
-
+ peak1Data = data[start:end] # data for the first peak
+ peak2Data = data[newStart:newEnd] # data for the second peak
+ btwnData = data[end:newStart] # data between peaks
+ joinData = data[start:newEnd] # data as if the peaks were one
+
+ jMax = np.max(joinData) # the max height for the joined peak
+ p1Max = np.max(peak1Data) # the max height for the first peak
+ p2Max = np.max(peak2Data) # the max height for the second peak
+ bMin = SS.scoreatpercentile(btwnData, 5) # don't use the minimum to avoid extreme division,
+ # get the 5th percentile

if bMin == 0:
+ # These peaks are obviously separated :-)
return False

if bMin <= jMax * options.peakRatio:
@@ -155,28 +149,34 @@
# peaks are too far, don't join
return False

+ # Ok, join them!
return True

+
def findDataPeaks(data, options):
# This is the real find peaks routine, as data have been
- # splitted postive and negative...
-
- nW = options.expWindow
- k = options.peakInterspersion
- interMaxDist = nW * k # this is, somehow, the max distance we can expect between two peaks
- e = 1e-10
- FlexusIndx = np.array([], dtype = np.int32)
- FlexAsc = np.array([], dtype = np.int32)
- FlexDes = np.array([], dtype = np.int32)
- peakBounds = []
- grossPeaks = []
+ # split positive and negative... Splitting is necessary to treat the negative values without
+ # any particular modification to the routine
+
+ nW = options.expWindow # the expected window size
+ k = options.peakInterspersion
+ interMaxDist = nW * k # The max distance we can expect between two peaks
+ epsilon = 1e-30
+ FlexusIndx = np.array([], dtype = np.int32) # Where peak boundaries will be stored at first
+ FlexAsc = np.array([], dtype = np.int32) # Where peak starts will be stored
+ FlexDes = np.array([], dtype = np.int32) # Where peak ends will be stored
+ grossPeaks = [] # A peak list to be processed further

Debug(1, timestring(), "Calculating Sobel filter")
+ # The sobel filter is necessary to get the flexus points for our data. Flexus points
+ # will be considered as peak boundaries. The Sobel filter is a convenient way to take the
+ # first derivative of the original data
sobelFilter = scipy.ndimage.filters.sobel(data)
+ # We smooth the filter otherwise we get a ton of boundaries!!!
sobelFilter = scipy.ndimage.filters.gaussian_filter1d(sobelFilter,
int(10 * np.log10(nW)))

Debug(1, timestring(), "Getting maxima and minima")
- # I need minima to split peaks later on
+ # We need minima to split peaks later on
# Minima and maxima for original data are when the filter crosses zero
- dataMinMax = DSP.fastZero(sobelFilter + e)
+ dataMinMax = DSP.fastZero(sobelFilter + epsilon)

@@ -185,13 +185,20 @@
# for filter... we may apply another sobel but no...

if len(dataMinMax) == 0:
+ # Oddly enough, there were no peaks... we add two values so that there will be a
+ # single huge peak
dataMinMax = np.append(0, len(data) - 1)

+ # Ensure that the first point is 0 and the last is the last data point...
if dataMinMax[0] != 0: dataMinMax = np.append(0, dataMinMax)
if dataMinMax[-1] < len(data) - 1: dataMinMax = np.append(dataMinMax, len(data) - 1)

Debug(1, timestring(), "Getting peak boundaries")
for x in xrange(len(dataMinMax) - 1):
+ # For all maxima and minima, get the max and the min position
+ # in the sobel filter... these points are the flexus points on the original data
+ # Also, max(sobel) are the ascending flexa (peak starts), min(sobel) are the descending
+ # ones (peak ends)
pos1 = dataMinMax[x]
pos2 = dataMinMax[x + 1]
window = sobelFilter[pos1:pos2]
@@ -200,31 +207,35 @@
FlexAsc = np.append(FlexAsc, np.argmax(window) + pos1)
else:
FlexDes = np.append(FlexDes, np.argmin(window) + pos1)
- del(sobelFilter) #free some memory
-
- nPeaks = len(FlexAsc)
- Debug(2, timestring(), "Refining and merging", nPeaks, 'peaks')
+ # We don't need the Sobel filter anymore, delete it and free some memory
+ del(sobelFilter)
+
+ # Now it's time to merge peaks that are likely part of the same enriched region
+ nPeaks = float(len(FlexAsc))
+ Debug(2, timestring(), "Refining and merging", int(nPeaks), 'peaks')

#initialize
- x = 0
- y = 1
- fxe1 = -1
- fxe2 = -1
- fxs1 = 0
- fxs2 = 0
- tmpfx = 0
- nPeaks = float(nPeaks)
+ x = 0 # Counter for the leftmost peak
+ y = 1 # Counter for the y-th peak at right
+ fxe1 = -1 # End position for the first peak
+ fxe2 = -1 # End position for the second peak
+ fxs1 = 0 # Start position for the first peak
+ fxs2 = 0 # Start position for the second peak
+ tmpfx = 0 # A temporary flexus point...
while x < len(FlexAsc):
+ # Iterate as long as there are peaks
percComplete = int(np.ceil(x / nPeaks * 100))

try:
- fxs1 = FlexAsc[x]
- fxs2 = FlexAsc[x + y]
+ fxs1 = FlexAsc[x] # Get the first starting point
+ fxs2 = FlexAsc[x + y] # Get the next useful peak
except IndexError:
+ # Ok... this is just a check to make the while loop end
x += 1
continue

try:
+ # Get the first end point at the right of fxs1 (the start point of the first peak)
tmpfx = FlexDes[FlexDes > fxs1][0]
except IndexError:
# don't know how to handle this... this means that there's a start without an end
@@ -235,6 +246,7 @@
if tmpfx > fxe1:
# fxe1 should be updated everytime the peak grows
# otherwise set it to the first flexus after the first start
+ # We fall into this condition at least on the first iteration, as fxe1 has been initialized to -1
fxe1 = tmpfx
try:
fxe2 = FlexDes[FlexDes > fxs2][0]
@@ -254,27 +266,31 @@
if growPeak(data, fxs1, fxe1, fxs2, fxe2, interMaxDist, options):
# these belong to the same peak check the next
y += 1
- # update the end
+ # update the end of the joined peak
fxe1 = fxe2
else:
# we have two separate peaks, add the first to the list
# and start over the second
- pHeight = np.max(data[fxs1:fxe1])
- pArea = np.sum(data[fxs1:fxe1])
- grossPeaks.append([fxs1, fxe1, 0.5, pArea, pHeight])
+ pHeight = np.max(data[fxs1:fxe1]) # the actual height
+ pArea = np.sum(data[fxs1:fxe1]) # the actual area, calculated as sum
+ # this should be an integral ^.^
+ grossPeaks.append([fxs1, fxe1, 0.5, pArea, pHeight]) # append the defined peak with a dummy p-value
x = x + y
y = 1
Debug(2, timestring(), "Refining and merging",
len(FlexAsc), 'peaks', percComplete, "%")
Debug(1)
return grossPeaks

+
def assignProbabilities(data, grossPeaks, options):
# a separate routine to assign peak probabilities...
- peakBounds = []
- peakAreaList = np.array([])
- nPeaks = float(len(grossPeaks))
-
- if 'Z' in options.pipeline:
+ # remember the grossPeaks organization:
+ # [[start, end, p-value, area, height], [start, end, p-value, area, height], ...]
+ peakBounds = [] # The final peak list
+ peakAreaList = np.array([]) # The peak areas
+ nPeaks = float(len(grossPeaks)) # Again, number of peaks...
+
+ if options.zerodata:
# we have removed negative values, we are interested only in
# enriched data, so this is a 1-tail p estimation
oneTail = True
@@ -282,19 +298,26 @@
oneTail = False

if nPeaks == 0:
+ # Why on earth should there be no peaks? Don't know, but return an empty array
return peakBounds

- peakAreaList = np.array([np.sum(data[x[0]:x[1]]) for x in grossPeaks])
+ # We already have peak areas, just put them into an np.array... those are very useful
+ peakAreaList = np.array([x[3] for x in grossPeaks])

if options.distType == 'hist':
+ # Calculate the p-value by simply evaluating the data distribution. This is not necessarily the
+ # best way to do things. It should be said that I don't know a priori the data distribution
+ # model. Also, I don't believe poisson or negative binomial is the proper distribution model;
+ # I think some hyperbolic or hyperbolic-like model would fit better...
+ # Estimate the number of bins
nbins = np.ceil(np.sqrt(nPeaks))
if nbins < 10:
- # if we have less than 30 bins
+ # Just in case we have fewer than 10 bins
nbins = 10
-
- # get the list of peak areas. Here peak area is the sum, I may
- # switch to proper integrals in the future...
- # also assign area to each peak
+
+ # When using a histogram, most of the data lies around the expected value... getting
+ # the p-value for extreme values will yield very low values, so we perform a log transform
+ # just to better separate the data. Note that we have to ensure that the min value is positive
minCorrect = np.abs(np.min(peakAreaList)) + 1
peakAreaList = np.log(peakAreaList + minCorrect) #put everything above 0 and set logarithmic scale

@@ -309,6 +332,9 @@
# should I multply the p of each single peak?
cy = y.cumsum() / y.sum()
elif options.distType == 'wald':
+ # This uses the inverse gaussian distribution... In my opinion this is the best way to
+ # describe the data... unfortunately results suggest I'm wrong. It should be said that I'm
+ # having troubles estimating the correct parameters...
(xmA, lA) = estimateWaldParameters(peakAreaList[peakAreaList >= 0])
if np.isnan(lA):
Debug(1, timestring(), "Warning, switching to hist dist type because
of errors...")
@@ -333,13 +359,17 @@
xrC = np.linspace(np.min(peakAreaList), np.max(peakAreaList), num=nA + nB - 1)
pdfB = dB.pdf(xrB)

- #crossCorrelate
+ # Once we have the pdf for both peak series, we cross-correlate them. This produces
+ # the correct distribution of the data. I'm doing this because I know that single peak
+ # series are distributed in different ways. I don't know which is the correct
+ # joint distribution, so I need to cross-correlate to get it...
pdfC = SG.correlate(pdfA, pdfB)
else:
xrC = xrA
pdfC = pdfA
cy = pdfC.cumsum() / pdfC.sum()
elif options.distType == 'power':
+ # The same old story... this time it is a Pareto distribution.
(alphaA, xmA) = estimatePowerParameters(peakAreaList[peakAreaList >= 0])
if np.isnan(xmA):
Debug(1, timestring(), "Warning, switching to hist dist type because
of errors...")
@@ -376,7 +406,6 @@
# we calculate p-value as 2-tailed distribution
# in principle we can't assume a positive value to be at the right tail
# and viceversa. Just see if the pvalue is higher or lower than 0.5.
-
# to have a pvalue check the cumulative distribution, look for values having an
# area higher than this and get the first item (leftmost). This should be the p I'm looking
# for...
@@ -398,35 +427,40 @@
pvalue = 1 - cumPvalue
else:
pvalue = cumPvalue
+ # In the end we have a p-value. Apparently I'm not able to directly set it into
+ # grossPeaks... that's why I'm using an additional array...
peakBounds.append([peak[0], peak[1], pvalue, peak[3], peak[4]])
Debug(2, timestring(), "Calculating peak probabilities",
percComplete, "%")
Debug(1)
return peakBounds

def findPeaks(data, options):
- # we are going to split data
+ # To find peaks we first split the data into positive and negative values
+ # this allows us to use the very same routines without caring what we are doing :-)
if options.zerodata:
- # already zeroed data, get these
+ # There's no need to split, we already removed negative values
pbounds = findDataPeaks(data, options)
else:
- Debug(1, timestring(), "Working on signal A")
+ Debug(1, timestring(), "Working on positive values")
subData = np.zeros(len(data))
mask = data > 0
subData[mask] = data[mask]
pbounds = findDataPeaks(subData, options)
- #negative values
- if options.sigB:
- # There's a control...
- Debug(1, timestring(), "Working on signal B")
- #reset subData
- subData[mask] = 0
- #get a new mask for negative values
- mask = data < 0
- # get new subData and set them positive
- subData[mask] = -data[mask]
- # find peaks and add them to previous list
- # remember that area and height should be set negative
- pbounds = pbounds + [[x[0], x[1], x[2], -x[3], -x[4]] for x in findDataPeaks(subData, options)]
+ # negative values can arise from control processing or from mathematical issues,
+ # e.g. the FFT or wavelet steps can introduce negative values.
+ # in the past I used to check negative values only when the control is given,
+ # to avoid math issues. The problem is that I can now feed dspchip with processed
+ # signals that may have negative values per se.
+ Debug(1, timestring(), "Working on negative values")
+ # reset subData
+ subData[mask] = 0
+ # get a new mask for negative values
+ mask = data < 0
+ # get new subData and set them positive
+ subData[mask] = -data[mask]
+ # find peaks and add them to previous list
+ # remember that area and height should be set negative
+ pbounds = pbounds + [[x[0], x[1], x[2], -x[3], -x[4]] for x in findDataPeaks(subData, options)]
del(subData)
pbounds.sort()
pbounds = assignProbabilities(data, pbounds, options)
@@ -483,6 +517,9 @@
return stats


+# From this point on there are commented functions that used to be part of the code.
+# I don't erase them as I may resume some in the future... they're like zombies!
+
#def filterPeaks(data, peaks, window):
# return np.asarray([x for x in peaks if mad(data[x[0]:x[1]]) >= mad(data[x[0]- 2 * window:x[1]+ 2 * window])])
#
