Single Column using ColumnSelector

136 views
Skip to first unread message

Craig J

unread,
Feb 19, 2018, 3:28:12 PM2/19/18
to mlxtend
I'm working with some text data. I plan on using TF-IDF features as well as some contextual information from the text, such as the number of capital letters, length of the message, use of exclamation points, etc. I'd like to use a stacking classifier to combine the predictions. I'll use one pipeline to make predictions based on the text content (TF-IDF) and a second pipeline to make predictions based on the text context (caps, exclamation points, etc.). The problem I'm running into is that when I use a single column in the column selector to just grab the comments and run them through my pipeline, I receive an error (AttributeError: 'numpy.ndarray' object has no attribute 'lower'). If I change X so there is only one column and remove the column selector, the pipe works properly. If I use two columns in a random forest classifier, it works correctly, so as best I can tell it has something to do with the column selector, but I'm not sure what. Any suggestions?

# Data
X = data.values
y = data['toxic'].values

# Create dataset 
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.33, 
                                                    random_state=42,
                                                    stratify = y
                                                   )

# Dataset columns
[(0, 'id'),
 (1, 'comment_text'),
 (2, 'toxic'),
 (3, 'severe_toxic'),
 (4, 'obscene'),
 (5, 'threat'),
 (6, 'insult'),
 (7, 'identity_hate'),
 (8, 'message_length'),
 (9, 'caps'),
 (10, 'elipsis'),
 (11, 'exclamation'),
 (12, 'commas'),
 (13, 'comment_clean')]

# Create pipelines
pipe1 = make_pipeline(ColumnSelector(cols=(13,)),
                      CountVectorizer(), 
                      DenseTransformer(),
                      MultinomialNB()
                     )

pipe2 = make_pipeline(ColumnSelector(cols=(2,3,4,5,6,7,8,9,10,11,12)),
                      RandomForestClassifier()
                     )

# Stack
sclf = StackingClassifier(classifiers=[pipe1, pipe2], 
                          meta_classifier=LogisticRegression()
                         )

sclf.fit(X_train, y_train)



Error 

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-230-c98b763c8ccf> in <module>()
      3                          )
      4 
----> 5 sclf.fit(X_train, y_train)

~/anaconda2/envs/sb/lib/python3.6/site-packages/mlxtend/classifier/stacking_classification.py in fit(self, X, y)
    124                 print(_name_estimators((clf,))[0][1])
    125 
--> 126             clf.fit(X, y)
    127 
    128         meta_features = self.predict_meta_features(X)

~/anaconda2/envs/sb/lib/python3.6/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
    246             This estimator
    247         """
--> 248         Xt, fit_params = self._fit(X, y, **fit_params)
    249         if self._final_estimator is not None:
    250             self._final_estimator.fit(Xt, y, **fit_params)

~/anaconda2/envs/sb/lib/python3.6/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
    211                 Xt, fitted_transformer = fit_transform_one_cached(
    212                     cloned_transformer, None, Xt, y,
--> 213                     **fit_params_steps[name])
    214                 # Replace the transformer of the step with the fitted
    215                 # transformer. This is necessary when loading the transformer

~/anaconda2/envs/sb/lib/python3.6/site-packages/sklearn/externals/joblib/memory.py in __call__(self, *args, **kwargs)
    360 
    361     def __call__(self, *args, **kwargs):
--> 362         return self.func(*args, **kwargs)
    363 
    364     def call_and_shelve(self, *args, **kwargs):

~/anaconda2/envs/sb/lib/python3.6/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, weight, X, y, **fit_params)
    579                        **fit_params):
    580     if hasattr(transformer, 'fit_transform'):
--> 581         res = transformer.fit_transform(X, y, **fit_params)
    582     else:
    583         res = transformer.fit(X, y, **fit_params).transform(X)

~/anaconda2/envs/sb/lib/python3.6/site-packages/sklearn/feature_extraction/text.py in fit_transform(self, raw_documents, y)
    867 
    868         vocabulary, X = self._count_vocab(raw_documents,
--> 869                                           self.fixed_vocabulary_)
    870 
    871         if self.binary:

~/anaconda2/envs/sb/lib/python3.6/site-packages/sklearn/feature_extraction/text.py in _count_vocab(self, raw_documents, fixed_vocab)
    790         for doc in raw_documents:
    791             feature_counter = {}
--> 792             for feature in analyze(doc):
    793                 try:
    794                     feature_idx = vocabulary[feature]

~/anaconda2/envs/sb/lib/python3.6/site-packages/sklearn/feature_extraction/text.py in <lambda>(doc)
    264 
    265             return lambda doc: self._word_ngrams(
--> 266                 tokenize(preprocess(self.decode(doc))), stop_words)
    267 
    268         else:

~/anaconda2/envs/sb/lib/python3.6/site-packages/sklearn/feature_extraction/text.py in <lambda>(x)
    230 
    231         if self.lowercase:
--> 232             return lambda x: strip_accents(x.lower())
    233         else:
    234             return strip_accents

AttributeError: 'numpy.ndarray' object has no attribute 'lower'


Sebastian Raschka

unread,
Feb 19, 2018, 4:31:43 PM2/19/18
to Craig J, mlxtend
Hi, Craig,

my first thought was that the column selector might mistakenly return an array with only 1 axis when there's only one column selected, but it looks fine to me:


from mlxtend.feature_selection import ColumnSelector
from sklearn.datasets import load_iris

iris = load_iris()
X = iris.data
y = iris.target

col_selector = ColumnSelector(cols=(0, ))
col_selector.transform(X).shape
(150, 1)

Could you maybe try to run

pipe1.fit(X_train, y_train)

to see if that's a general issue with the column selector or whether that's something related to the stacking classifier?

Best,
Sebastian
> --
> You received this message because you are subscribed to the Google Groups "mlxtend" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to mlxtend+u...@googlegroups.com.
> To post to this group, send email to mlx...@googlegroups.com.
> To view this discussion on the web visit https://groups.google.com/d/msgid/mlxtend/47019e7c-a871-4d3c-8852-ce840cdf7c87%40googlegroups.com.
> For more options, visit https://groups.google.com/d/optout.

Craig J

unread,
Feb 19, 2018, 5:32:09 PM2/19/18
to mlxtend
I tried to fit the pipe and it results in the same error.  Below is a toy example that throws the error I'm seeing.

import pandas as pd
from mlxtend.feature_selection import ColumnSelector
from mlxtend.preprocessing import DenseTransformer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline


data = pd.DataFrame([{'outcome':1, 'txt':"Sample text!!!"}])
data['exclamation'] = data['txt'].apply(lambda x: x.count('!'))

X = data[['txt', 'exclamation']].values
y = data['outcome'].values

pipe1 = make_pipeline(ColumnSelector(cols=(1,)),
                      CountVectorizer(ngram_range=(1,3)), 
                      DenseTransformer(),
                      TfidfTransformer(),
                      DenseTransformer(),
                      MultinomialNB()
                     )

pipe1.fit(X, y)



Sebastian Raschka

unread,
Feb 19, 2018, 5:44:09 PM2/19/18
to Craig J, mlxtend
Oh, I think I know what's going on. The CountVectorizer might expect a 1D array, not a 2D array:

E.g., the following would cause the same issue that you got:

X = np.array(['abc abc abc abd', 'def def def def def sef'])[:, np.newaxis]
y = np.array([0, 1])

pipe1 = make_pipeline(CountVectorizer(ngram_range=(1,3)),
DenseTransformer(),
TfidfTransformer(),
DenseTransformer(),
MultinomialNB())

but if you remove the [:, np.newaxis] part, it should work.

I think it would be good to add a "drop_axis" attribute to the ColumnSelector that, if true, would remove the last axis of the X array. It should be a quick mod; I can take a look at it now.

Best,
Sebastian
> --
> You received this message because you are subscribed to the Google Groups "mlxtend" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to mlxtend+u...@googlegroups.com.
> To post to this group, send email to mlx...@googlegroups.com.
> To view this discussion on the web visit https://groups.google.com/d/msgid/mlxtend/f5b972ff-729a-4d88-b1cd-e8906e9aa422%40googlegroups.com.

Sebastian Raschka

unread,
Feb 19, 2018, 7:19:43 PM2/19/18
to Sebastian Raschka, Craig J, mlxtend
I have now added a "drop_axis" option to the ColumnSelector on the master branch. Can you install it via

pip install git+git://github.com/rasbt/mlxtend.git


and try


import pandas as pd
from mlxtend.feature_selection import ColumnSelector
from mlxtend.preprocessing import DenseTransformer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline


data = pd.DataFrame([{'outcome':1, 'txt':"Sample text!!!"}])
data['exclamation'] = data['txt'].apply(lambda x: x.count('!'))

X = data[['txt', 'exclamation']].values
y = data['outcome'].values

pipe1 = make_pipeline(ColumnSelector(cols=0, drop_axis=True),
CountVectorizer(ngram_range=(1,3)),
DenseTransformer(),
TfidfTransformer(),
DenseTransformer(),
MultinomialNB()
)

pipe1.fit(X, y)





> pipe1 = make_pipeline(CountVectorizer(ngram_range=(1,3), ),
> DenseTransformer(),
> TfidfTransformer(),
> DenseTransformer(),
> MultinomialNB())
> To view this discussion on the web visit https://groups.google.com/d/msgid/mlxtend/D8528E4F-417F-4CAD-81A4-F1D8146C97F6%40gmail.com.

Craig J

unread,
Feb 19, 2018, 9:10:03 PM2/19/18
to mlxtend
Hi Sebastian, 

I was able to get it installed and it's no longer throwing the error and appears to be working.  Thanks a bunch for making that change and hopefully others will find it as helpful as me!

Sebastian Raschka

unread,
Feb 19, 2018, 9:25:10 PM2/19/18
to Craig J, mlxtend
That's great, I am glad to hear that it's working now!

> On Feb 19, 2018, at 9:10 PM, Craig J <cjoh...@gmail.com> wrote:
>
> Hi Sebastian,
>
> I was able to get it installed and it's no longer throwing the error and appears to be working. Thanks a bunch for making that change and hopefully others will find it as helpful as me!
>
>
> --
> You received this message because you are subscribed to the Google Groups "mlxtend" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to mlxtend+u...@googlegroups.com.
> To post to this group, send email to mlx...@googlegroups.com.
> To view this discussion on the web visit https://groups.google.com/d/msgid/mlxtend/27f7a7e9-9fa6-4ded-98f2-57def49c1e50%40googlegroups.com.
Reply all
Reply to author
Forward
0 new messages