Partial success in using GPU on Windows

Jan Chorowski

unread,

Jun 22, 2011, 1:02:56 PM6/22/11

to theano-dev

Hi All,

I got a little further in getting the compile options to work on Windows.
Unfortunately, I'm only learning to use Theano, so I am not able to
judge how well (or how badly) it works right now. Toy programs, such as
the one in the documentation, work.

The steps are:
1. Instead of setting the compiler location in .theanorc, one needs to
execute vcvars32.bat from the bin directory of the Visual Studio
installation; this sets up the environment for VS.
2. Instead of supplying cuda_ndarray.pyd on the command line, we
need to link against the library with -lcuda_ndarray.
3. All functions exported from cuda_ndarray need to be prefixed with
__declspec(dllexport) in the header file to be visible to the linker.

I hope it will help.

Please see the diff file at the end of the post.

Jan Chorowski

diff against the current hg repository:

diff -r 1dad8c06719d theano/gof/compiledir.py
--- a/theano/gof/compiledir.py Mon Jun 20 09:45:52 2011 -0400
+++ b/theano/gof/compiledir.py Wed Jun 22 09:00:14 2011 -0700
@@ -11,7 +11,7 @@
platform.platform(),
platform.processor(),
platform.python_version()])
- platform_id = re.sub("[\(\)\s]+", "_", platform_id)
+ platform_id = re.sub("[\(\)\s,]+", "_", platform_id)
return 'compiledir_' + platform_id

diff -r 1dad8c06719d theano/sandbox/cuda/cuda_ndarray.cuh
--- a/theano/sandbox/cuda/cuda_ndarray.cuh Mon Jun 20 09:45:52
2011 -0400
+++ b/theano/sandbox/cuda/cuda_ndarray.cuh Wed Jun 22 09:00:14
2011 -0700
@@ -6,6 +6,12 @@

#include <cublas.h>

+#ifdef _WIN32
+#define DllExport __declspec( dllexport )
+#else
+#define DllExport
+#endif
+
typedef float real;
#define REAL_TYPENUM 11

@@ -36,8 +42,8 @@
* device_malloc will set the Python error message before returning
None.
* device_free will return nonzero on failure (after setting the
python error message)
*/
-void * device_malloc(size_t size);
-int device_free(void * ptr);
+DllExport void * device_malloc(size_t size);
+DllExport int device_free(void * ptr);

template <typename T>
static T ceil_intdiv(T a, T b)
@@ -80,25 +86,25 @@
/*
* Return a CudaNdarray whose 'nd' dimensions are all 0.
*/
-PyObject *
+DllExport PyObject *
CudaNdarray_New(int nd=-1);

/**
* Return 1 for a CudaNdarray otw 0
*/
-int
+DllExport int
CudaNdarray_Check(const PyObject * ob);

/**
* Return 1 for a CudaNdarray otw 0
*/
-int
+DllExport int
CudaNdarray_CheckExact(const PyObject * ob);

/**
* Return true for a C-contiguous CudaNdarray, else false
*/
-bool
+DllExport bool
CudaNdarray_is_c_contiguous(const CudaNdarray * self);

/****
@@ -307,14 +313,14 @@
*
* No Storage space is allocated (and all dimensions are 0)
*/
-PyObject * CudaNdarray_new_nd(const int nd);
+DllExport PyObject * CudaNdarray_new_nd(const int nd);

/**
* [Re]allocate a CudaNdarray with access to 'nd' dimensions.
*
* Note: This does not allocate storage for data.
*/
-int CudaNdarray_set_nd(CudaNdarray * self, const int nd)
+DllExport int CudaNdarray_set_nd(CudaNdarray * self, const int nd)
{
if (nd != self->nd)
{
@@ -455,7 +461,7 @@
*
* Set self to be a view of given `data`, owned by existing
CudaNdarray `base`.
*/
-int CudaNdarray_set_device_data(CudaNdarray * self, float * data,
PyObject * base);
+DllExport int CudaNdarray_set_device_data(CudaNdarray * self, float *
data, PyObject * base);
int CudaNdarray_set_device_data(CudaNdarray * self, float * data,
CudaNdarray * base)
{
return CudaNdarray_set_device_data(self, data, (PyObject *)
base);
@@ -464,55 +470,55 @@
/**
* Return an independent copy of self
*/
-PyObject * CudaNdarray_DeepCopy(CudaNdarray * self, PyObject * memo);
+DllExport PyObject * CudaNdarray_DeepCopy(CudaNdarray * self,
PyObject * memo);

/**
* Return an independent copy of self
*/
-PyObject * CudaNdarray_Copy(CudaNdarray * self);
+DllExport PyObject * CudaNdarray_Copy(CudaNdarray * self);

/**
* Return a new object obtained by summing over the dimensions for
which there is a 1 in the mask.
*/
-PyObject * CudaNdarray_ReduceSum(CudaNdarray * self, PyObject *
py_reduce_mask);
+DllExport PyObject * CudaNdarray_ReduceSum(CudaNdarray * self,
PyObject * py_reduce_mask);

/**
* Transfer the contents of numpy array `obj` to `self`.
*
* self is reallocated to have the correct dimensions if necessary.
*/
-int CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj);
+DllExport int CudaNdarray_CopyFromArray(CudaNdarray * self,
PyArrayObject*obj);

/**
* Transfer the contents of CudaNdarray `other` to `self`.
*
* self is reallocated to have the correct dimensions if necessary.
*/
-int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray *
other, bool unbroadcast = false);
+DllExport int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self,
CudaNdarray * other, bool unbroadcast = false);

/**
* Transfer the contents of CudaNdarray `self` to a new numpy
ndarray.
*/
-PyObject *
+DllExport PyObject *
CudaNdarray_CreateArrayObj(CudaNdarray * self);

-PyObject *
+DllExport PyObject *
CudaNdarray_ZEROS(int n, int * dims);

/**
* True iff the strides look like [dim[nd-2], dim[nd-3], ... ,
dim[0], 1]
*/
-bool CudaNdarray_is_c_contiguous(const CudaNdarray * self);
+DllExport bool CudaNdarray_is_c_contiguous(const CudaNdarray * self);

-int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const
CudaNdarray * B, float beta, CudaNdarray * C);
-int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y,
CudaNdarray* A);
+DllExport int CudaNdarray_gemm(float alpha, const CudaNdarray * A,
const CudaNdarray * B, float beta, CudaNdarray * C);
+DllExport int CudaNdarray_sger(float alpha, CudaNdarray * x,
CudaNdarray * y, CudaNdarray* A);

-int CudaNdarray_reduce_sum(CudaNdarray * self, CudaNdarray * A);
-int CudaNdarray_reduce_prod(CudaNdarray * self, CudaNdarray * A);
-int CudaNdarray_reduce_min(CudaNdarray * self, CudaNdarray * A);
-int CudaNdarray_reduce_max(CudaNdarray * self, CudaNdarray * A);
+DllExport int CudaNdarray_reduce_sum(CudaNdarray * self, CudaNdarray
* A);
+DllExport int CudaNdarray_reduce_prod(CudaNdarray * self, CudaNdarray
* A);
+DllExport int CudaNdarray_reduce_min(CudaNdarray * self, CudaNdarray
* A);
+DllExport int CudaNdarray_reduce_max(CudaNdarray * self, CudaNdarray
* A);

-int CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len,
const int * pattern);
+DllExport int CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int
len, const int * pattern);

void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
{
diff -r 1dad8c06719d theano/sandbox/cuda/nvcc_compiler.py
--- a/theano/sandbox/cuda/nvcc_compiler.py Mon Jun 20 09:45:52
2011 -0400
+++ b/theano/sandbox/cuda/nvcc_compiler.py Wed Jun 22 09:00:14
2011 -0700
@@ -181,7 +181,10 @@
cmd.extend(['-o',lib_filename])
cmd.append(os.path.split(cppfilename)[-1])
if module_name != 'cuda_ndarray':
- cmd.append(os.path.join(os.path.split(cppfilename)
[0],'..','cuda_ndarray','cuda_ndarray.'+get_lib_extension()))
+ if sys.platform != 'win32':
+ cmd.append(os.path.join(os.path.split(cppfilename)
[0],'..','cuda_ndarray','cuda_ndarray.'+get_lib_extension()))
+ else:
+ cmd.append('-lcuda_ndarray')
cmd.extend(['-L%s'%ldir for ldir in lib_dirs])
cmd.extend(['-l%s'%l for l in libs])
if sys.platform == 'darwin':

Olivier Delalleau

unread,

Jun 22, 2011, 1:18:47 PM6/22/11

to thean...@googlegroups.com

Sweet, thanks! I'll look into it when I can find some time for it... (probably next week).

-=- Olivier

2011/6/22 Jan Chorowski <jan.ch...@gmail.com>

Eelco

unread,

Jun 28, 2011, 3:29:07 PM6/28/11

to theano-dev

I don't know enough about the internals to know what's going on here,
but I am very much hoping for GPU support under Windows to come our
way, so keep up the good work!

Regards,
Eelco

Reply all

Reply to author

Forward