Pearson correlation coefficient is just the regular correlation,
numpy.corrcoef plus the t-statistic for the test that the correlation
coefficient is zero.
I'm not familiar enough with the sparse package to know how the
details work, but in my first try, `mean` seems strange to me
>>> B
<4x4 sparse matrix of type '<type 'numpy.int32'>'
with 4 stored elements in Compressed Sparse Row format>
>>> B.todense()
matrix([[3, 0, 1, 0],
[0, 2, 0, 0],
[0, 0, 0, 0],
[0, 0, 0, 1]])
>>> B.todense().mean(axis=0)
matrix([[ 0.75, 0.5 , 0.25, 0.25]])
>>> B.mean(axis=0)
matrix([[0, 0, 0, 0]])
>>> B.todense().mean(axis=1)
matrix([[ 1. ],
[ 0.5 ],
[ 0. ],
[ 0.25]])
>>> B.mean(axis=1)
matrix([[1],
[0],
[0],
[0]])
>>> B.sum(axis=1)
matrix([[4],
[2],
[0],
[1]])
>>> B.sum(axis=0)
matrix([[3, 2, 1, 1]])
Here is my version of sparse corrcoef and cov, that takes also zero
points into account, i.e. it is the same as using
np.cov(sparsematrix.todense()) but (I hope) it avoids any dense
calculation on the original matrix:
import numpy as np
from scipy import sparse, stats
# example from doc strings, help
I = np.array([0,0,1,3,1,0,0])
J = np.array([0,2,1,3,1,0,0])
V = np.array([1,1,1,1,1,1,1])
B = sparse.coo_matrix((V,(I,J)),shape=(4,4)).tocsr()
def covcsr(x):
'''return covariance matrix, assumes column variable'''
meanx = x.sum(axis=0)/float(x.shape[0])
return ((x.T*x)/x.shape[0] - meanx.T*meanx)
def corrcoefcsr(x):
covx = covcsr(x)
stdx = np.sqrt(np.diag(covx))[np.newaxis,:]
return covx/(stdx.T * stdx)
B1 = B[:,:2]
B1d = B1.todense()
print 'sparse cov:\n', covcsr(B1)
print 'np.cov:\n', np.cov(B1d, rowvar=0, bias=1)
print 'sparse corrcoef:\n', corrcoefcsr(B1)
print 'np.corrcoef:\n', np.corrcoef(B1d, rowvar=0, bias=1)
print 'stats.pearsonr:', stats.pearsonr(B1d[:,0],B1d[:,1])
Josef
_______________________________________________
SciPy-user mailing list
SciPy...@scipy.org
http://mail.scipy.org/mailman/listinfo/scipy-user
Sparse probably uses an integer accumulator for these functions, whereas
normal numpy arrays use a float accumulator.
Probably a bug: http://projects.scipy.org/scipy/ticket/884
--
Pauli Virtanen
Yep, this is something we'll address in SciPy 0.8:
http://projects.scipy.org/scipy/ticket/658
As a work around, you could do
>>> B * ones((B.shape[1],1), dtype='float64')
and scipy.sparse will upcast B's data array accordingly. This is
somewhat more efficient than
>>> B.astype('float64').sum(axis=0)
--
Nathan Bell wnb...@gmail.com
http://graphics.cs.uiuc.edu/~wnbell/
I don't know what the interpretation of zero values in the sparse
matrix would be in this case, but to me it looks like you are using
inconsistent sample size n (where at least one of two is non-zero)
while the mean is based on a smaller n (non-zero for one vector). I
also thought that, how the zeros in one vector are treated, depends on
the zeros in the second vector, but maybe not because you subtract the
mean only from the non-zero values x.data-mx, y.data-my and zeros in
the dot product shouldn't matter.
According to my interpretation, you are replacing the zero elements in
the joint vectors (size n) by the means of each vector. Under this
interpretation the sample size and the degrees of freedom might be
correct. I don't know if this is the desired behavior.
I updated my version of the dense equivalent correlation a bit:
import numpy as np
from scipy import sparse, stats
I = np.array([0,3,1,0,4,5,5])
J = np.array([0,3,1,2,3,0,1])
V = np.array([4,5,7,9,3,1,2])
B = sparse.coo_matrix((V,(I,J)),shape=(6,4)).tocsr()
def cov_csr(x, axis=0):
'''return covariance matrix, assumes column variable
return type ndarray'''
meanx = x.sum(axis=axis)/float(x.shape[axis])
if axis == 0:
return np.array((x.T*x)/x.shape[axis] - meanx.T*meanx)
else:
return np.array((x*x.T)/x.shape[axis] - meanx*meanx.T)
def corrcoef_csr(x, axis=0):
'''correlation matrix, return type ndarray'''
covx = cov_csr(x, axis=axis)
stdx = np.sqrt(np.diag(covx))[np.newaxis,:]
return covx/(stdx.T * stdx)
def pearsonr_csr(x, axis=0):
'''returns correlation coefficient or matrix,
t-statistic and p-value for null hypothesis of zero correlation
(of pairwise correlation if more than 2 variables, columns)'''
r = corrcoef_csr(x, axis=axis)
TINY = 1e-15
df = x.shape[axis]-2
t = r*np.sqrt(df/((1.0-r+TINY)*(1.0+r+TINY)))
if t.shape[axis] == 2:
t = t[0,1]
r = r[0,1]
else:
t += np.diag(np.inf*np.ones(t.shape[0]))
return r, t, stats.t.sf(np.abs(t),df)*2
B1 = B[:,:2]
B1d = B1.todense()
print 'sparse cov:\n', cov_csr(B1)
print 'np.cov:\n', np.cov(B1d, rowvar=0, bias=1)
print 'sparse corrcoef:\n', corrcoef_csr(B1)
print 'np.corrcoef:\n', np.corrcoef(B1d, rowvar=0, bias=1)
r, t, p = pearsonr_csr(B1)
print 'sparse pearson r\n', r
print 'sparse t-test:t-stat\n', t
print 'sparse t-test:pvalue\n', p
rs, ps =stats.pearsonr(B1d[:,0],B1d[:,1])
print 'stats.pearsonr:', rs, ps
r0, t0, p0 = pearsonr_csr(B)
print 'sparse pearson r\n', r0
print 'sparse t-test:t-stat\n', t0
print 'sparse t-test:pvalue\n', p0
r1, t1, p1 = pearsonr_csr(B.T, axis=1)
##print 'sparse pearson r\n', r1
##print 'sparse t-test:t-stat\n', t1
##print 'sparse t-test:pvalue\n', p1
from numpy.testing import assert_equal, assert_almost_equal
assert_almost_equal(cov_csr(B1), np.cov(B1d, rowvar=0, bias=1), decimal=14)
assert_almost_equal(corrcoef_csr(B1), np.corrcoef(B1d, rowvar=0,
bias=1), decimal=14)
assert_almost_equal(r, rs, decimal=14)
assert_almost_equal(p, ps, decimal=14)
#test axis
assert_equal(r0, r1)
assert_equal(t0, t1)
assert_equal(p0, p1)
assert_equal(cov_csr(B), cov_csr(B.T, axis=1))
assert_equal(corrcoef_csr(B), corrcoef_csr(B.T, axis=1))
Josef