Dear Cython Users,
I am a very inexperienced Cython users and I am trying to understand how to speed up a little function - I seem to have tried everything I know but the Cython implementation I have is still between 5 and 50 times slower than a dumb Fortran+f2py routine I have written.
This little function simply finds the row and column indices and the nonzero values of a 2D matrix, which is relatively sparse.
That said, both the Cython and Fortran+f2py routines receive a dense, single precision, fortran-ordered 2D matrix of varying dimensions. Both routines don't return anything (on purpose, this is a very simplified version of the problem I have), but I think I have taken steps to avoid the compiler optimizing away stuff.
If I run both routines with the attached sample, I get this type of elapsed times:
Elapsed Times (ms)
M N NNZ Cython Fortran + F2PY Ratio
1661 2608 5833 25.0584 2.7079 9.3
2828 3512 7191 40.0933 6.4342 6.2
780 985 1657 6.7442 0.4403 15.3
799 1558 2772 10.8066 0.7142 15.1
399 540 908 3.1496 0.1289 24.4
10545 10486 23787 271.3768 70.9125 3.8
3369 3684 6962 42.6007 7.9357 5.4
2052 5513 6822 39.4725 7.2816 5.4
224 628 1555 4.8283 0.0944 51.1
553 1475 1849 7.4616 0.4713 15.8
Where M, N are the number of rows/columns of the matrix, NNZ the number of nonzero elements and then two columns for the elapsed time in ms, plus the ratio Cython/Fortran.
It may easily be that passing a single precision, fortran-ordered matrix to Cython is not the best idea, but this is the input I have to live with.
Any suggestion in making the Cython implementation more performant is most welcome - I attach all the scripts at the end of the email. Thank you in advance.
Andrea.
# CYTHON - cython_test_nnz.pyx
import numpy as np
cimport numpy as np
import cython
from cython.view cimport array as cvarray
cdef float SMALL = 1e-10
cdef int MAXNNZ = 1000000
@cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False) # turn off negative index wrapping for entire function
@cython.nonecheck(False)
cpdef find_nnz(np.float32_t[::1, :] matrix):
cdef int n, m, i, j, nza
cdef float val1
col_starts = cvarray(shape=(MAXNNZ,), itemsize=sizeof(int), format='i')
row_indices = cvarray(shape=(MAXNNZ,), itemsize=sizeof(int), format='i')
x = cvarray(shape=(MAXNNZ,), itemsize=sizeof(double), format='d')
m = matrix.shape[0]
n = matrix.shape[1]
nza = 0
for i in range(n):
for j in range(m):
val1 = matrix[j, i]
if abs(val1) < SMALL:
continue
col_starts[nza] = i+1
row_indices[nza] = j
x[nza] = <double>val1
nza += 1
# Cython setup.py file
from setuptools import setup
from setuptools import Extension
from Cython.Build import cythonize
import numpy as np
if __name__ == '__main__':
extra_compile_args = ['-O3', '-ffast-math', '-funroll-loops', '-flto', '-ftree-vectorize']
extra_link_args = ['-flto'] + extra_compile_args
extensions = [Extension('cython_test_nnz', ['cython_test_nnz.pyx'],
extra_compile_args=extra_compile_args,
extra_link_args=extra_link_args,
include_dirs=[np.get_include()],
define_macros=
[('WIN32', 1)] ]
# build the core extension(s)
setup_kwargs = {'ext_modules': cythonize(extensions)}
setup(**setup_kwargs)
# Fortran implementation
subroutine find_nnz(matrix, m, n)
implicit none
integer, intent(in):: m, n
real(4), intent(in), dimension(m, n) :: matrix
integer nza, i, j, maxnnz
real(4) small, val1
parameter(maxnnz=1000000)
double precision, dimension(maxnnz):: x
integer, dimension(maxnnz):: row_indices, col_starts
small = 1e-10
nza = 1
do i = 1, n
do j = 1, m
val1 = matrix(j, i)
if (abs(val1) < small) then
cycle
endif
col_starts(nza) = i+1
row_indices(nza) = j
x(nza) = dble(val1)
nza = nza + 1
enddo
enddo
! Avoid compiler optimizing away stuff
x(1) = x(1) + 1.0
col_starts(1) = col_starts(1) + 1
row_indices(1) = row_indices(1) + 1
end subroutine find_nnz
# Compile the fortran routine
f2py --fcompiler=gfortran --opt="-O3 -ffast-math -funroll-loops -flto -ftree-vectorize" -c fortran_test_nnz.f90 -m fortran_test_nnz
# Test Script
import os
import time
import numpy as np
from scipy.sparse import rand as srand
import cython_test_nnz
import fortran_test_nnz
SIZES = [(1661, 2608) , (2828, 3512), (780, 985) , (799, 1558), (399, 540),
(10545, 10486), (3369, 3684), (2052, 5513), (224, 628) , (553, 1475)]
NNZ = [5833, 7191, 1657, 2772, 908, 23787, 6962, 6822, 1555, 1849]
np.random.seed(123456)
def tester():
for i in range(len(NNZ)):
m, n = SIZES[i]
nnz = NNZ[i]
density = nnz/float(m*n)
matrix = srand(m, n, density=density, format='coo', dtype=np.float32)
matrix = matrix.todense(order='F')
print('%-6d %-6d %-6d'%(m, n, nnz)),
for implementation in ['Cython', 'Fortran']:
function = cython_test_nnz.find_nnz if implementation == 'Cython' else fortran_test_nnz.find_nnz
time_setup = []
for k in range(10):
start = time.clock()
function(matrix)
elapsed = time.clock() - start
time_setup.append(elapsed * 1e3)
print('%-10.4f' % np.mean(time_setup)),
print('')
if __name__ == '__main__':
tester()