[dspchip] r233 committed - Latest cleaning...


dsp...@googlecode.com

Feb 11, 2011, 9:16:16 AM
to dspchi...@googlegroups.com
Revision: 233
Author: daweonline
Date: Fri Feb 11 06:15:31 2011
Log: Latest cleaning
added convolution and cross-correlation between signals
added some debugging facilities

http://code.google.com/p/dspchip/source/detail?r=233

Modified:
/trunk/src/dclib/DSP.py
/trunk/src/dclib/IO.py
/trunk/src/dclib/NumAnalysis.py
/trunk/src/dspchip
/trunk/src/setup.py
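
For orientation before the diff: the headline addition is corrconv() in DSP.py, which cross-correlates or convolves two signals window by window and then interpolates the result back to the original length. A minimal sketch of the windowed step (illustrative only, not part of the commit; the signals, window and step values below are made up):

import numpy as np

a = np.random.rand(5000)   # hypothetical signal A (e.g. ChIP)
b = np.random.rand(5000)   # hypothetical signal B (e.g. control)
window, step = 200, 20
# one cross-correlation value per sliding window (mode='valid' on equal-length
# slices returns a single value); swap in np.convolve for the convolution case
xc = np.array([np.correlate(a[x:x+window], b[x:x+window], mode='valid')
               for x in np.arange(0, len(a) - window, step)]).ravel()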

=======================================
--- /trunk/src/dclib/DSP.py Fri Feb 11 03:06:18 2011
+++ /trunk/src/dclib/DSP.py Fri Feb 11 06:15:31 2011
@@ -38,113 +38,85 @@
import bx.wiggle

def fastZero(dog, ext=1):
+ # dog, here, is Difference Of Gaussians.
+ # BTW you don't need a dog for this routine to get the zero-crossing points...
return np.where(dog[:-ext] * dog[ext:] <= 0 )[0]

def mad(data):
+ # Calculate the Median Absolute Deviation
+ # don't consider nan values, they mess up any calculation!
data = data[np.isnan(data) ^ True]
median = np.median(data)
absmed = np.abs(data - median)
return np.median(absmed)

def baseline(data, blineMethod, expWindow):
+ # Calculate the data baseline.
l = len(data)
step = expWindow

if blineMethod.lower() == "mad":
- #simply calculate MAD for all chromosome and subtrac 2*MAD
+ # simply calculate MAD for the whole chromosome and subtract 2*MAD
+ # This is quite stupid, but it works sometimes
smad = mad(data)
data = data - smad * 2
return data
elif blineMethod.lower() == "min":
# get the median of minima for windowed data and subtract it
+ # this is the default method
+ # we start resizing data into a step-wide matrix
if step > l: step = l
if l % step == 0:
tmp = np.resize(data, (l/step, step))
else:
+ # there may be a "tail", append the last value to
+ # have a full row... This doesn't change the min for that
+ # row...
tlen = step - l % step
tail = np.zeros(tlen) + data[-1]
tmp = np.append(data, tail)
tmp = np.resize(tmp, (l/step + 1, step))
minima = np.minimum.reduce(tmp, 1)
-
# assume that the baseline is the same for the entire signal...
b = data - np.median(minima)
-# b[ b < 0 ] = 0
return b[:len(data)]
elif blineMethod.lower() == "hist":
# ok, this is nice, based on data values
- nbins = len(data) / (2 * expWindow)
+ nbins = np.ceil(np.sqrt(len(data)))
if nbins < 100:
nbins = 100
hist, bins = np.histogram(data, bins=nbins, normed=True)
# get the highest peak, that is the most distributed value
# and then get the value between it and the next bin,
# use this as baseline for all the chromosome
+ # this is not a baseline correction, this is a thresholding function!
bin = np.argmax(hist)
threshold = (bins[bin] + bins[bin + 1]) / 2
b = data - threshold
-# b[ b < 0 ] = 0
return b[:len(data)]
else:
Debug(1, timestring(), blineMethod, "is not a known function, skipping
baseline correction")
return data

-def buildWavelet(window_len=16, window="flattop"):
- dec_lo = eval('scipy.signal.'+window+'(window_len)')
- dec_lo = dec_lo * np.sqrt(2) / sum(dec_lo)
- dec_lo = np.append(np.zeros(window_len/4), dec_lo)
- dec_lo = np.append(dec_lo, np.zeros(window_len/4*3))
- rec_lo = dec_lo[::-1]
-# rec_hi = np.array(dec_lo)
-# for i in range(len(dec_lo)):
-# if i % 2 == 1:
-# rec_hi[i] = -rec_hi[i]
- dec_hi = getDoG(dec_lo)
- rec_hi = dec_hi[::-1]
- w = pywt.Wavelet(name=window, filter_bank = [dec_lo, dec_hi, rec_lo,
rec_hi])
-# w.orthogonal = True
- return w

def getDoG(data, s1 = 1, s2 = 4):
+ # Calculate the difference of gaussians. Useful for edge detection?
gs1 = scipy.ndimage.gaussian_filter1d(data, s1)
gs2 = scipy.ndimage.gaussian_filter1d(data, s2)
return gs2 - gs1

-def fillSparse(sparsedata, length, step=10):
- outdata = np.zeros(length)
- x = 0
- for i in range(step):
- try:
- outdata[i::step] = sparsedata[:len(sparsedata) - x]
- except ValueError:
- x += 1
- outdata[i::step] = sparsedata[:len(sparsedata) - x]
- return outdata
-
-#def smooth(x,window_len=10,window='hanning'):
-# if x.ndim != 1:
-# raise ValueError, "smooth only accepts 1 dimension arrays."
-# if x.size < window_len:
-# raise ValueError, "Input vector needs to be bigger than window size."
-# if window_len<3:
-# return x
-# if not window in ['flat', 'hanning', 'hamming', 'bartlett', 'blackman']:
-# raise ValueError, "Window is on
of 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'"
-# s=np.r_[2*x[0]-x[window_len:1:-1],x,2*x[-1]-x[-1:-window_len:-1]]
-# if window == 'flat': #moving average
-# w=ones(window_len,'d')
-# else:
-# w=eval('np.'+window+'(window_len)')
-# y=np.convolve(w/w.sum(),s,mode='same')
-# return y[window_len-1:-window_len+1]

def gsmooth(data, expWindow):
Debug(1, timestring(), "WARNING: Gaussian filter processing may drain
your RAM")
smoothFactor = int(1 + np.log10(expWindow))
return scipy.ndimage.gaussian_filter1d(data, smoothFactor)

+
def fftsmooth(data, expWindow):
+ # Smooth data using a fast Fourier transform...
Debug(1, timestring(), "WARNING: FFT processing may drain your RAM")
+ # Get the FFT window as a power of 2. Windows of any other
+ # size are veeeery slow. The problem is that a chromosome is not 2^n long...
fftwindow = 2
while fftwindow < expWindow:
fftwindow = fftwindow << 1
@@ -152,6 +124,8 @@
while fftwindow > len(data):
fftwindow = fftwindow >> 1
Debug(0, timestring(), "FFT window size:", fftwindow)
+ # This is why we perform fft 5'->3' and 3'->5'...
+ # in the end we will merge the data
tail = np.zeros(fftwindow - len(data) % fftwindow)
tpad = np.append(data, tail)
tpad = tpad.reshape(len(tpad)/ fftwindow, fftwindow)
@@ -173,6 +147,7 @@
return (tpad.ravel()[:len(data)] + hpad.ravel()[-len(data):]) / 2.0

def autocorrelate(data, expWindow, wstep):
+ # Autocorrelation of the signal...
window = expWindow
step = wstep
l = len(data)
@@ -199,9 +174,50 @@
return intrpData


+def corrconv(data1, data2, expWindow, wstep, convolve = False):
+ # Cross-correlate or convolve two signals, to obtain a new cross-correlated or convolved signal
+ # cross-correlation is much like a subtraction, convolution the opposite?
+ # assume data1 and data2 have same length
+ window = expWindow
+ step = wstep
+ l = len(data1)
+ if l != len(data2):
+ Debug(5, "Both data should have the same length for this
cross-correlation")
+ #pad data
+ data1 = np.append(data1, np.zeros(window / 2))
+ data1 = np.append(np.zeros(window / 2), data1)
+ data2 = np.append(data2, np.zeros(window / 2))
+ data2 = np.append(np.zeros(window / 2), data2)
+ Debug(1, timestring(), "Autocorrelation: window size", expWindow, "step
size:", wstep)
+ if convolve:
+ # I know I could use np.correlate with data2 reversed instead of a separate np call... but I trust numpy...
+ y = np.array([np.convolve(data1[x:x+window], data2[x:x+window], mode='valid') for x in np.arange(0, len(data1) - window, step)]).ravel()
+ else:
+ y = np.array([np.correlate(data1[x:x+window], data2[x:x+window], mode='valid') for x in np.arange(0, len(data1) - window, step)]).ravel()
+ x = np.arange(0, l, step)
+ Debug(1,timestring(), "Getting the linear interpolation")
+ s = scipy.interpolate.interp1d(x, y, bounds_error=False, fill_value=0)
+ del(x)
+ del(y)
+ if convolve:
+ Debug(1,timestring(), "Rebuilding the convolved signal")
+ else:
+ Debug(1,timestring(), "Rebuilding the cross-correlated signal")
+
+ # Interpolation
+ intrpData = np.array([s(np.arange(x, x + expWindow)) for x in np.arange(0, l, expWindow)]).ravel()
+
+ if len(intrpData) > l:
+ return intrpData[:l]
+ trimL = l - len(intrpData)
+ intrpData = np.append(intrpData, s(np.arange(trimL,l)))
+ return intrpData
+
+
def wltdenoise(data, wavelet, expWindow):
- nchunks = 10
- wlt = None
+ # Use wavelets to remove noise
+ wlt = None # The wavelet to use
+ # Instantiate the wavelet
try:
wlt = pywt.Wavelet(wavelet)
except ValueError:
@@ -211,16 +227,29 @@
if not wlt:
Debug(5, "there was an error in building the wavelet")
Debug(1, timestring(), "Getting wavelet coefficients for wavelet",
wavelet)
+
+ # Estimate which level we will use, according to the expected signal size
useLevel, wlen = estimateWaveLevel(wlt, expWindow)
+
+ # Perform the wavelet decomposition
coeffs = pywt.wavedec(data, wlt, level=useLevel, mode='sp1')
Debug(1, timestring(), "Filtering coefficients. Decomposition level:",
useLevel, "Wavelet peak width:", wlen)
+
+ # Set all the useless levels to zero before recomposition
for i in range(1, len(coeffs)):
coeffs[i] *= 0
Debug(1, timestring(), "Rebuilding signal")
+
+ # Get the signal back!
signal = pywt.waverec(coeffs, wlt, mode='sp1')
return signal

+
def estimateWaveLevel(wavelet, expWindow):
+ # Instead of using the wavelet psi function length
+ # we check the "useful" wavelet portion, i.e. the level
+ # at which the size of the wavelet above zero
+ # is comparable with the expected signal size...
f = wavelet.wavefun()[0]
l = len(f)
z = fastZero(f)
@@ -244,22 +273,24 @@
level += 1
return rLevel, l /2

+
def equalize(data, expWindow, minimum = 0, maximum = 1):
-# sorted_data = np.sort(data)
-# nbins = np.ceil(np.log2(len(data)) + 1)
-# nbins = np.ceil(2 * (scipy.stats.scoreatpercentile(data, 75) -
scipy.stats.scoreatpercentile(data, 25)) / np.power(len(data), 1.0/3))
- nbins = np.ceil(len(data) / (2 * expWindow))
+ # OMG! is this a good way to do things? In principle we bin data
+ # and then get the cumulative sum
+ # and then set it to a specified range
+ # and then interpolate data on that...
+ nbins = np.ceil(np.sqrt(len(data)))
if nbins < 100:
nbins = 100

hist, bins = np.histogram(data, nbins, normed=True)
-# cdf = sorted_data.cumsum()
cdf = hist.cumsum()
cdf /= cdf[-1]
cdf = cdf * (maximum - minimum) + minimum

return np.interp(data, bins[:-1], cdf)

+
def normalize(data, mode='mean'):
if mode=='mean':
av = np.mean(data)
@@ -268,8 +299,10 @@
av = np.median(data)
st = mad(data)
return (data - av) / st
+

def correlateSignals(data1, data2, expWindow):
+ # this is to calculate the linear regression and correlation coefficient of the data
l = len(data1) #assume data1 and data2 are equal size :-)
m = l % expWindow

@@ -298,3 +331,53 @@
(ccoef, cpv, m, b, r, tt, error) = (0,0,0,0,0,0,0)

return (ir1, ir2,(ccoef, cpv), (m, b, r, tt, error))
+
+
+# Here starts the zombie function zone...
+
+
+#def fillSparse(sparsedata, length, step=10):
+# outdata = np.zeros(length)
+# x = 0
+# for i in range(step):
+# try:
+# outdata[i::step] = sparsedata[:len(sparsedata) - x]
+# except ValueError:
+# x += 1
+# outdata[i::step] = sparsedata[:len(sparsedata) - x]
+# return outdata
+#
+#def smooth(x,window_len=10,window='hanning'):
+# if x.ndim != 1:
+# raise ValueError, "smooth only accepts 1 dimension arrays."
+# if x.size < window_len:
+# raise ValueError, "Input vector needs to be bigger than window size."
+# if window_len<3:
+# return x
+# if not window in ['flat', 'hanning', 'hamming', 'bartlett', 'blackman']:
+# raise ValueError, "Window is on
of 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'"
+# s=np.r_[2*x[0]-x[window_len:1:-1],x,2*x[-1]-x[-1:-window_len:-1]]
+# if window == 'flat': #moving average
+# w=ones(window_len,'d')
+# else:
+# w=eval('np.'+window+'(window_len)')
+# y=np.convolve(w/w.sum(),s,mode='same')
+# return y[window_len-1:-window_len+1]
+#def buildWavelet(window_len=16, window="flattop"):
+# # This function will be useful when I add custom wavelets
+# dec_lo = eval('scipy.signal.'+window+'(window_len)')
+# dec_lo = dec_lo * np.sqrt(2) / sum(dec_lo)
+# dec_lo = np.append(np.zeros(window_len/4), dec_lo)
+# dec_lo = np.append(dec_lo, np.zeros(window_len/4*3))
+# rec_lo = dec_lo[::-1]
+## rec_hi = np.array(dec_lo)
+## for i in range(len(dec_lo)):
+## if i % 2 == 1:
+## rec_hi[i] = -rec_hi[i]
+# dec_hi = getDoG(dec_lo)
+# rec_hi = dec_hi[::-1]
+# w = pywt.Wavelet(name=window, filter_bank = [dec_lo, dec_hi, rec_lo,
rec_hi])
+## w.orthogonal = True
+# return w
+
+
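
A side note on the equalize() change above (nbins is now the square root of the data length): the underlying idea is plain histogram equalization, sketched below for illustration only. The data here are made up, and np.histogram is called without the normed flag since the CDF is normalized explicitly anyway.

import numpy as np

data = np.random.exponential(size=10000)          # hypothetical coverage-like signal
nbins = int(max(np.ceil(np.sqrt(len(data))), 100))
hist, bins = np.histogram(data, bins=nbins)
cdf = hist.cumsum().astype(float)
cdf /= cdf[-1]                                    # cumulative distribution in [0, 1]
minimum, maximum = 0.0, 1.0
cdf = cdf * (maximum - minimum) + minimum         # rescale to the requested range
equalized = np.interp(data, bins[:-1], cdf)       # map every value through the CDF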
=======================================
--- /trunk/src/dclib/IO.py Fri Feb 11 03:06:18 2011
+++ /trunk/src/dclib/IO.py Fri Feb 11 06:15:31 2011
@@ -43,17 +43,18 @@
for s in Steps:
sidx = options.pipeline.find(s)
if sidx >= 0:
+ Debug(1, "Warning: step", s, "has been removed from your pipeline")
options.pipeline = options.pipeline[:sidx] + options.pipeline[sidx +
1:]


def prepareOptions():
usage= "%prog <-i file> [options]"
description = "A tool to analyze genomic enrichment data inspired by DSP
techniques"
- theParser = OptionParser(version="%prog 0.7.7",
description=description,usage=usage)
+ theParser = OptionParser(version="%prog 0.7.8",
description=description,usage=usage)

theParser.add_option("-i","--sigA",dest="sigA",type="string",help="The
first signal file (ChIP)", default=None)
theParser.add_option("-c","--sigB",dest="sigB",type="string",help="The
second signal file (Control)", default=None)
- theParser.add_option("--pl",dest="pipeline",type="string",help="The
signal analysis pipeline string. Each step can be [F]FT Transform,
[A]utocorrelation, [W]avelet denoise, [G]aussian smoothing, [E]qualize,
[N]ormalize, [S]ubtract, [R]ratio, [B]aseline, [Z]ero min. Example: FEBN
will perform FFT, equalization, baseline subtraction and normalization in
this order", default="LWZ")
+ theParser.add_option("--pl",dest="pipeline",type="string",help="The
signal analysis pipeline string. Each step can be [F]FT Transform,
[A]utocorrelation, [W]avelet denoise, [G]aussian smoothing, [X]-correlate,
con[V]olve, [E]qualize, [N]ormalize, [S]ubtract, [R]ratio, [B]aseline,
[Z]ero min. Example: FEBN will perform FFT, equalization, baseline
subtraction and normalization in this order", default="LWZ")

theParser.add_option("--winfun",dest="windowingFunction",type="string",help="Windowing
function for profile output [mean|med|max]", default="mean")
theParser.add_option("-n","--name",dest="name",type="string",help="The
experiment name", default="default")

theParser.add_option("-W","--wavelet",dest="wavelet",type="string",help="The
wavelet to use. This should be defined in pywt library", default="db5")
@@ -82,10 +83,17 @@

theParser.add_option("--noprofile",action="store_true",dest="noprof",help="Don't
save a profile file", default=False)

theParser.add_option("--save",action="store_true",dest="npysave",help="Save
chromosome data into npy files", default=False)

theParser.add_option("--stats",action="store_true",dest="savestats",help="Save
basic statistics for processed signals", default=False)
-
-
-
return theParser
+
+
+def fakeOptions():
+ theParser = prepareOptions()
+ # This routine is intended for interactive usage... when you don't have time
+ # to write a full args list in ipython... It fills options with standard values,
+ # users only have to change what they want
+ (options, args) = theParser.parse_args([])
+ return options
+

def parseOptions(theParser):
(options,args) = theParser.parse_args()
@@ -124,16 +132,16 @@
options.pipeline = options.pipeline.upper()

# check that options that reduce to a single dataset are chosen one at
time
- if 'R' in options.pipeline and 'S' in options.pipeline:
- Debug(5, "Subtract and Ratio steps in pipeline are mutually exclusive")
- if 'R' in options.pipeline and 'L' in options.pipeline:
- Debug(5, "Log2ratio and Ratio steps in pipeline are mutually
exclusive")
- if 'S' in options.pipeline and 'L' in options.pipeline:
- Debug(5, "Subtract and Ratio steps in pipeline are mutually exclusive")
+ reducingSteps = 0
+ for step in options.pipeline:
+ if step in 'SLRXV':
+ reducingSteps += 1
+ if reducingSteps > 1:
+ Debug(5, "You specified", reducingSteps, "steps that reduce the number
of signals, only one allowed")

if not options.sigB:
# remove subtract from pipeline if control is not present
- removePipelineSteps(options, 'SLR')
+ removePipelineSteps(options, 'SLRXV')

#controls on format
if not options.formatB:
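
For interactive work, the new fakeOptions() helper above pairs naturally with the X/V pipeline steps; a possible ipython session might look like the sketch below. Import paths and attribute names are taken from the diff, and expWindow/wstep are assumed to have parser defaults (not verified).

import numpy as np
import dclib.IO
import dclib.NumAnalysis

options = dclib.IO.fakeOptions()   # parser defaults, no command line needed
options.pipeline = 'X'             # 'X' = cross-correlate, 'V' = convolve
chip = np.random.rand(10000)       # hypothetical signal A
ctrl = np.random.rand(10000)       # hypothetical signal B
xcorr = dclib.NumAnalysis.PPLXCorr(chip, ctrl, options)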
=======================================
--- /trunk/src/dclib/NumAnalysis.py Fri Feb 11 05:21:08 2011
+++ /trunk/src/dclib/NumAnalysis.py Fri Feb 11 06:15:31 2011
@@ -45,6 +45,17 @@
return data


+def PPLXCorr(data1, data2, options):
+ convolve = False
+ if 'V' in options.pipeline:
+ convolve = True
+ Debug(1, timestring(), "About to perform signal convolution")
+ else:
+ Debug(1, timestring(), "About to perform signal cross-correlation")
+ data = DSP.corrconv(data1, data2, options.expWindow, options.wstep, convolve)
+ return data
+
+
def PPLFFT(data, options):
Debug(1, timestring(), "About to perform FFT smooth")
data = DSP.fftsmooth(data, options.expWindow)
=======================================
--- /trunk/src/dspchip Fri Feb 11 03:06:18 2011
+++ /trunk/src/dspchip Fri Feb 11 06:15:31 2011
@@ -32,7 +32,6 @@
import numpy as np

def main():
-
# get command line options and print them
options = dclib.IO.parseOptions(dclib.IO.prepareOptions())
dclib.IO.debugParameters(options)
@@ -136,6 +135,13 @@
OneDataSet = True
if options.npysave:
dclib.IO.npysave(theSignalA, options.name,
chrom, "SignalA", "diff")
+ elif analysisStep == 'X' or analysisStep == 'V':
+ # Cross-correlate or convolve...
+ theSignalA = dclib.NumAnalysis.PPLXCorr(theSignalA, theSignalB, options)
+ del(theSignalB)
+ OneDataSet = True
+ if options.npysave:
+ dclib.IO.npysave(theSignalA, options.name, chrom, "SignalA", "xcorr")
elif analysisStep == 'R':
# Calculate data ratio, i.e. (A-B)/A
theSignalA = dclib.NumAnalysis.PPLRatio(theSignalA, theSignalB,
options)
=======================================
--- /trunk/src/setup.py Thu Feb 10 12:48:41 2011
+++ /trunk/src/setup.py Fri Feb 11 06:15:31 2011
@@ -1,7 +1,7 @@
from distutils.core import setup

setup(name='dspchip',
- version='0.7.7',
+ version='0.7.8',
author='Davide Cittaro',
author_email='daweo...@gmail.com',
url='http://code.google.com/p/dspchip',
