Using H2O 3.10.4.8 on Ubuntu 14.04.4 with Python 3.6.0 (I also experience this on Python 3.5.2), if I try to fit a large GLM with only factor variables, it fails with a java.lang.ArrayIndexOutOfBoundsException. If I add a single continuous predictor to the model that fails, it runs fine.
I am able to recreate the problem with the example below:
# Reproduction, part 1: build a seeded synthetic data set and load it into H2O.
import pandas as pd
import numpy as np
import h2o
# Initialise the H2O session before any H2OFrame is created.
h2o.init()
# Fixed seed so the random draws below are identical on every run; the order
# of the draws therefore matters and should not be changed.
np.random.seed(1234)
# All columns have 33000*5 = 165,000 rows.
# x1, x2: integers in 1..192 (randint's upper bound is exclusive).
x1 = np.random.randint(1,193,33000*5)
x2 = np.random.randint(1,193,33000*5)
# x3, x4: integers in 1..7.
x3 = np.random.randint(1,8,33000*5)
x4 = np.random.randint(1,8,33000*5)
# x5: continuous values in [0, 10).
x5 = np.random.random(33000*5)*10
# n: integers in 1..99999; y is a random fraction of n, and log(n) is stored
# in the 'offset' column of the frame below.
n = np.random.randint(1,100000,33000*5)
y = np.random.random(33000*5)*n
X = pd.DataFrame({'y' : y, 'n' : n, 'x1' : x1, 'x2' : x2, 'x3' : x3, 'x4' : x4, 'x5' : x5, 'offset':np.log(n)})
# Upload to H2O, then convert x1..x4 to factor (categorical) columns.
# x5, n, y and offset are left as they were uploaded.
X_h2o = h2o.H2OFrame(X)
X_h2o['x1'] = X_h2o['x1'].asfactor()
X_h2o['x2'] = X_h2o['x2'].asfactor()
X_h2o['x3'] = X_h2o['x3'].asfactor()
X_h2o['x4'] = X_h2o['x4'].asfactor()
## This fails with java.lang.ArrayIndexOutOfBoundsException
# Failing case: all four predictors (x1..x4) are factor columns, and x1 and x2
# are additionally passed as the interaction columns.
predictors = ['x1', 'x2', 'x3', 'x4']
ix_list = ['x1','x2']
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
# Poisson GLM without intercept and without standardization; alpha and lambda_
# are fixed to the values shown rather than searched.
poisson = H2OGeneralizedLinearEstimator(family="poisson", alpha=0.025, lambda_=0.0001, model_id='sem_poisson',\
standardize=False, interactions = ix_list, intercept=False)
# Response is 'y'; the 'offset' column (log(n) in the frame) is the offset.
poisson.train(training_frame=X_h2o, y='y', x=predictors, offset_column="offset")
## This runs without error
# Working case: the only difference from the failing case is that the
# continuous column 'x5' is added to the predictors. The interaction columns
# and every estimator argument are the same.
predictors = ['x1', 'x2', 'x3', 'x4', 'x5']
ix_list = ['x1','x2']
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
# Same model_id ('sem_poisson') as the failing fit.
poisson = H2OGeneralizedLinearEstimator(family="poisson", alpha=0.025, lambda_=0.0001, model_id='sem_poisson',\
standardize=False, interactions = ix_list, intercept=False)
# Response is 'y'; the 'offset' column (log(n) in the frame) is the offset.
poisson.train(training_frame=X_h2o, y='y', x=predictors, offset_column="offset")
This is the error message:
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
<ipython-input-74-3b9ec76c023b> in <module>()
9 from h2o.estimators.glm import H2OGeneralizedLinearEstimator
10 poisson = H2OGeneralizedLinearEstimator(family="poisson", alpha=0.025, lambda_=0.0001, model_id='sem_poisson', standardize=False, interactions = ix_list, intercept=False)
---> 11 poisson.train(training_frame=X_h2o, y='y', x=predictors, offset_column="offset")
/home/acorreia/anaconda3/lib/python3.6/site-packages/h2o/estimators/estimator_base.py in train(self, x, y, training_frame, offset_column, fold_column, weights_column, validation_frame, max_runtime_secs, ignored_columns, model_id)
202 return
203
--> 204 model.poll()
205 model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
206 self._resolve_model(model.dest_key, model_json)
/home/acorreia/anaconda3/lib/python3.6/site-packages/h2o/job.py in poll(self)
71 if (isinstance(self.job, dict)) and ("stacktrace" in list(self.job)):
72 raise EnvironmentError("Job with key {} failed with an exception: {}\nstacktrace: "
---> 73 "\n{}".format(self.job_key, self.exception, self.job["stacktrace"]))
74 else:
75 raise EnvironmentError("Job with key %s failed with an exception: %s" % (self.job_key, self.exception))
OSError: Job with key $03017f00000132d4ffffffff$_ac26dc0f4f46e174f756718a6ed744a4 failed with an exception: java.lang.ArrayIndexOutOfBoundsException
stacktrace:
java.lang.ArrayIndexOutOfBoundsException
With my real data set (which I'm unable to share), I sometimes see a slightly different stack trace in addition to the one above. This is its tail end:
---------------------------------------------------------------------------
/home/acorreia/anaconda3/lib/python3.6/site-packages/h2o/estimators/estimator_base.py in train(self, x, y, training_frame, offset_column, fold_column, weights_column, validation_frame, max_runtime_secs, ignored_columns, model_id)
202 return
203
--> 204 model.poll()
205 model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
206 self._resolve_model(model.dest_key, model_json)
/home/acorreia/anaconda3/lib/python3.6/site-packages/h2o/job.py in poll(self)
71 if (isinstance(self.job, dict)) and ("stacktrace" in list(self.job)):
72 raise EnvironmentError("Job with key {} failed with an exception: {}\nstacktrace: "
---> 73 "\n{}".format(self.job_key, self.exception, self.job["stacktrace"]))
74 else:
75 raise EnvironmentError("Job with key %s failed with an exception: %s" % (self.job_key, self.exception))
OSError: Job with key $03017f00000132d4ffffffff$_a243f5bd946a68975109a5e8c1fc4b4c failed with an exception: java.lang.ArrayIndexOutOfBoundsException: 1
stacktrace:
java.lang.ArrayIndexOutOfBoundsException: 1
at hex.DataInfo.filterExpandedColumns(DataInfo.java:485)
at hex.glm.ComputationState.applyStrongRules(ComputationState.java:175)
at hex.glm.ComputationState.setLambda(ComputationState.java:74)
at hex.glm.GLM$GLMDriver.computeSubmodel(GLM.java:1040)
at hex.glm.GLM$GLMDriver.computeImpl(GLM.java:1117)
at hex.ModelBuilder$Driver.compute2(ModelBuilder.java:169)
at hex.glm.GLM$GLMDriver.compute2(GLM.java:542)
at water.H2O$H2OCountedCompleter.compute(H2O.java:1315)
at jsr166y.CountedCompleter.exec(CountedCompleter.java:468)
at jsr166y.ForkJoinTask.doExec(ForkJoinTask.java:263)
at jsr166y.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:974)
at jsr166y.ForkJoinPool.runWorker(ForkJoinPool.java:1477)
at jsr166y.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:104)
# Variant script: same seeded synthetic data as the reproduction, plus a
# hand-built 'x1x2_combo' column that gives each distinct (x1, x2) pair its
# own id.
import pandas as pd
import numpy as np
import h2o
# Initialise the H2O session before any H2OFrame is created.
h2o.init()
# Fixed seed so the random draws below are identical on every run.
np.random.seed(1234)
# All columns have 33000*5 = 165,000 rows.
# x1, x2: integers in 1..192; x3, x4: integers in 1..7 (upper bounds exclusive).
x1 = np.random.randint(1,193,33000*5)
x2 = np.random.randint(1,193,33000*5)
x3 = np.random.randint(1,8,33000*5)
x4 = np.random.randint(1,8,33000*5)
# x5: continuous values in [0, 10).
x5 = np.random.random(33000*5)*10
# n: integers in 1..99999; y is a random fraction of n, and log(n) is stored
# in the 'offset' column.
n = np.random.randint(1,100000,33000*5)
y = np.random.random(33000*5)*n
X = pd.DataFrame({'y' : y, 'n' : n, 'x1' : x1, 'x2' : x2, 'x3' : x3, 'x4' : x4, 'x5' : x5, 'offset':np.log(n)})
# One row per distinct (x1, x2) pair, numbered 1..k in order of first appearance.
u_ixs = X[['x1', 'x2']].drop_duplicates()
u_ixs['x1x2_combo'] = np.arange(u_ixs.shape[0])+1
# Attach the pair id to every row; no join keys are given, so pandas joins on
# the columns the two frames share (x1 and x2).
X = X.merge(u_ixs)
# Upload to H2O, then convert x1..x4 and the pair id to factor columns.
X_h2o = h2o.H2OFrame(X)
X_h2o['x1'] = X_h2o['x1'].asfactor()
X_h2o['x2'] = X_h2o['x2'].asfactor()
X_h2o['x3'] = X_h2o['x3'].asfactor()
X_h2o['x4'] = X_h2o['x4'].asfactor()
X_h2o['x1x2_combo'] = X_h2o['x1x2_combo'].asfactor()
## No errors
# The predictors include the precomputed 'x1x2_combo' factor, and no
# `interactions` argument is passed to the estimator.
predictors = ['x1', 'x2', 'x3', 'x4', 'x1x2_combo']
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
# Poisson GLM without intercept and without standardization, with
# max_iterations set to 500.
# NOTE(review): no train() call is visible in this snippet; presumably it
# matches the train() calls in the earlier examples - confirm.
poisson = H2OGeneralizedLinearEstimator(family="poisson", alpha=0.025, lambda_=0.0001, model_id='sem_poisson',\
standardize=False, intercept=False, max_iterations=500)
I am running H2O 3.16.0.2 on Python 2.7.13. I replicated Awil's error example on my machine. Originally I was running into the same ArrayIndexOutOfBoundsException with the following settings:
# Estimator settings from the second report: interactions enabled, p-values
# requested, lambda_ set to 0, the IRLSM solver, and collinear-column removal.
# NOTE(review): `interactions_list` is not defined in this snippet, and no
# family or train() call is shown - confirm against the reporter's full script.
model = H2OGeneralizedLinearEstimator(interactions=interactions_list, compute_p_values=True, lambda_=0,
solver='IRLSM',
remove_collinear_columns=True)
In the latter case, adding a continuous variable did not help.