Skip to content Skip to sidebar Skip to footer

How To Add A Feature To A Vectorized Data Set?

I want to write a Naive Base text classificator. Because sklearn does not accept 'text form' features I am transforming them using TfidfVectorizer. I was successfully able to cre

Solution 1:

Here is a demo, showing how to union features and how to tune up hyperparameters using GridSearchCV.

Unfortunately your sample data set is too tiny to train a real model...

try:
    from pathlib import Path
except ImportError:             # Python 2from pathlib2 import Path
import os
import re
from pprint import pprint
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer, LabelEncoder, LabelBinarizer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.externals import joblib
from scipy.sparse import csr_matrix, hstack


classColumnSelector(BaseEstimator, TransformerMixin):

    def__init__(self, name=None, position=None,
                 as_cat_codes=False, sparse=False):
        self.name = name
        self.position = position
        self.as_cat_codes = as_cat_codes
        self.sparse = sparse

    deffit(self, X, y=None):
        return self

    deftransform(self, X, **kwargs):
        if self.name isnotNone:
            col_pos = X.columns.get_loc(self.name)
        elif self.position isnotNone:
            col_pos = self.position
        else:
            raise Exception('either [name] or [position] parameter must be not-None')
        if self.as_cat_codes and X.dtypes.iloc[col_pos] == 'category':
                ret = X.iloc[:, col_pos].cat.codes
        else:
            ret = X.iloc[:, col_pos]
        if self.sparse:
            ret = csr_matrix(ret.values.reshape(-1,1))
        return ret

union = FeatureUnion([
            ('text', 
             Pipeline([
                ('select', ColumnSelector('url')),
                #('pct', SelectPercentile(percentile=1)),
                ('vect', TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                         stop_words='english')),
             ]) ),
            ('ads',
             Pipeline([
                ('select', ColumnSelector('ads_keyword', sparse=True,
                                          as_cat_codes=True)),
                #('scale', StandardScaler(with_mean=False)),
             ]) )
        ])

pipe = Pipeline([
    ('union', union),
    ('clf', MultinomialNB())
])

param_grid = [
    {
        'union__text__vect': [TfidfVectorizer(sublinear_tf=True,
                                              max_df=0.5,
                                              stop_words='english')],
        'clf': [SGDClassifier(max_iter=500)],
        'union__text__vect__ngram_range': [(1,1), (2,5)],
        'union__text__vect__analyzer': ['word','char_wb'],
        'clf__alpha': np.logspace(-5, 0, 6),
        #'clf__max_iter': [500],
    },
    {
        'union__text__vect': [TfidfVectorizer(sublinear_tf=True,
                                              max_df=0.5,
                                              stop_words='english')],
        'clf': [MultinomialNB()],
        'union__text__vect__ngram_range': [(1,1), (2,5)],
        'union__text__vect__analyzer': ['word','char_wb'],
        'clf__alpha': np.logspace(-4, 2, 7),
    },
    #{        # NOTE: does NOT support sparse matrices!#    'union__text__vect': [TfidfVectorizer(sublinear_tf=True,#                                          max_df=0.5,#                                          stop_words='english')],#    'clf': [GaussianNB()],#    'union__text__vect__ngram_range': [(1,1), (2,5)],#    'union__text__vect__analyzer': ['word','char_wb'],#},
]

gs_kwargs = dict(scoring='roc_auc', cv=3, n_jobs=1, verbose=2)
X_train, X_test, y_train, y_test = \
    train_test_split(df[['url','ads_keyword']], df['target'], test_size=0.33)
grid = GridSearchCV(pipe, param_grid=param_grid, **gs_kwargs)
grid.fit(X_train, y_train)

# prediction
predicted = grid.predict(X_test)

Post a Comment for "How To Add A Feature To A Vectorized Data Set?"