Hi, I'm new to Python and to pystatsmodels. I'm trying to use the MICE imputer for a project, but I keep getting this error:
ValueError: array must not contain infs or NaNs
I am using a 40 × 40 matrix with a lot of missing points in it. I have tried using MICE on a subset of the variables, and that was successful. I'm wondering whether the error could be due to the number of columns I am using, or to the amount of missing data (many points are missing)?
Below is my code, followed by the full stack trace and error message. Any help would be much appreciated!
mi = mice.MICE("cost_A ~ PH_T + DAM + Free + Frgn + TIME + MW + Hd + GEN_n + Ht + Lt + R_VOL + CAP + C_VOL + T_KV + T_KM + PH_A + PEN_n + PEN_l + PEN_d + EXc + MT_t + Fl + Rset + GDP_AVG + GDP_CAGR + PPI + CPI + CoC + RoL + RegQ + GvmE + PolS + VcAcc + Free_PR + Free_CL + MUV_CAGR + MUV_avg + SAR_CAGR + USD_CAGR + EDU", sm.OLS, imp)
result = mi.fit(20, 5)
print(result.summary())
C:\Users\Sammy\Anaconda3\lib\site-packages\statsmodels\regression\linear_model.py:1353: RuntimeWarning: divide by zero encountered in double_scalars
return np.dot(wresid, wresid) / self.df_resid
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-14-623fc0308b79> in <module>()
1 mi = mice.MICE("cost_A ~ PH_T + DAM + Free + Frgn + TIME + MW + Hd + GEN_n + Ht + Lt + R_VOL + CAP + C_VOL + T_KV + T_KM + PH_A + PEN_n + PEN_l + PEN_d + EXc + MT_t + Fl + Rset + GDP_AVG + GDP_CAGR + PPI + CPI + CoC + RoL + RegQ + GvmE + PolS + VcAcc + Free_PR + Free_CL + MUV_CAGR + MUV_avg + SAR_CAGR + USD_CAGR + EDU", sm.OLS, imp)
----> 2 result = mi.fit(20, 5)
3 print(result.summary())
C:\Users\Sammy\Anaconda3\lib\site-packages\statsmodels\imputation\mice.py in fit(self, n_burnin, n_imputations)
1226
1227 # Run without fitting the analysis model
-> 1228 self.data.update_all(n_burnin)
1229
1230 for j in range(n_imputations):
C:\Users\Sammy\Anaconda3\lib\site-packages\statsmodels\imputation\mice.py in update_all(self, n_iter)
413 for k in range(n_iter):
414 for vname in self._cycle_order:
--> 415 self.update(vname)
416
417 if self.history_callback is not None:
C:\Users\Sammy\Anaconda3\lib\site-packages\statsmodels\imputation\mice.py in update(self, vname)
1002 """
1003
-> 1004 self.perturb_params(vname)
1005 self.impute(vname)
1006
C:\Users\Sammy\Anaconda3\lib\site-packages\statsmodels\imputation\mice.py in perturb_params(self, vname)
976
977 if self.perturbation_method[vname] == "gaussian":
--> 978 self._perturb_gaussian(vname)
979 elif self.perturbation_method[vname] == "boot":
980 self._perturb_bootstrap(vname)
C:\Users\Sammy\Anaconda3\lib\site-packages\statsmodels\imputation\mice.py in _perturb_gaussian(self, vname)
970 cov = self.results[vname].cov_params()
971 mu = self.results[vname].params
--> 972 self.params[vname] = np.random.multivariate_normal(mean=mu, cov=cov)
973
974
mtrand.pyx in mtrand.RandomState.multivariate_normal (numpy\random\mtrand\mtrand.c:36911)()
C:\Users\Sammy\Anaconda3\lib\site-packages\scipy\linalg\decomp_svd.py in svd(a, full_matrices, compute_uv, overwrite_a, check_finite, lapack_driver)
94
95 """
---> 96 a1 = _asarray_validated(a, check_finite=check_finite)
97 if len(a1.shape) != 2:
98 raise ValueError('expected matrix')
C:\Users\Sammy\Anaconda3\lib\site-packages\scipy\_lib\_util.py in _asarray_validated(a, check_finite, sparse_ok, objects_ok, mask_ok, as_inexact)
236 raise ValueError('masked arrays are not supported')
237 toarray = np.asarray_chkfinite if check_finite else np.asarray
--> 238 a = toarray(a)
239 if not objects_ok:
240 if a.dtype is np.dtype('O'):
C:\Users\Sammy\Anaconda3\lib\site-packages\numpy\lib\function_base.py in asarray_chkfinite(a, dtype, order)
1213 if a.dtype.char in typecodes['AllFloat'] and not np.isfinite(a).all():
1214 raise ValueError(
-> 1215 "array must not contain infs or NaNs")
1216 return a
1217
ValueError: array must not contain infs or NaNs