AI-Driven Feature Selection in Python!

Deep-dive on ML techniques for feature selection in Python — Part 2

Indraneel Dutta Baruah
Published in TDS Archive · 13 min read · Jul 10, 2022


Photo by Kevin Ku on Unsplash
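The snippets in this part reuse the imports and the train_features_v2 / train_target objects built in Part 1 of this series. For readers who want to run them standalone, here is a minimal setup sketch; the synthetic dataset and column names are made up for illustration:

# Minimal standalone setup (assumption: the Part 1 objects are unavailable)
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import lightgbm as lgb
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV, SequentialFeatureSelector
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Synthetic binary-classification data standing in for the Part 1 training set
X, y = make_classification(n_samples=500, n_features=20,
                           n_informative=8, random_state=101)
train_features_v2 = pd.DataFrame(X, columns=[f'feat_{i}' for i in range(20)])
train_target = pd.Series(y, name='target')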

A) Beta Coefficients

#3. Select the top n features based on the absolute value of their beta coefficients
# Beta Coefficients
beta_threshold = 10

################################ Functions ################################

def beta_coeff(data, train_target, beta_threshold):

    # Inputs
    # data - input feature data
    # train_target - target variable training data
    # beta_threshold - select n features with the highest absolute beta coefficient value

    # Standardise the dataset so coefficient magnitudes are comparable
    scaler = StandardScaler()
    data_v2 = pd.DataFrame(scaler.fit_transform(data))
    data_v2.columns = data.columns

    # Fit logistic regression on the standardised dataset
    # Manual Change in Parameters - Logistic Regression
    # Link to function parameters - https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
    # Note: penalty=None requires scikit-learn >= 1.2; use penalty='none' on older versions
    log = LogisticRegression(fit_intercept=False, penalty=None)
    log.fit(data_v2, train_target)

    # Rank features by absolute coefficient and keep the top n
    coef_table = pd.DataFrame(list(data_v2.columns)).copy()
    coef_table.insert(len(coef_table.columns), "Coefs", log.coef_.transpose())
    coef_table = coef_table.iloc[coef_table.Coefs.abs().argsort()]
    sr_data2 = coef_table.tail(beta_threshold)
    beta_top_features = sr_data2.iloc[:, 0].tolist()
    print(beta_top_features)

    beta_top_features_df = pd.DataFrame(beta_top_features, columns=['Feature'])
    beta_top_features_df['Method'] = 'Beta_coefficients'

    # Refit with statsmodels on the selected features for a readable summary
    log_v2 = sm.Logit(train_target,
                      sm.add_constant(data[beta_top_features])).fit()
    print('Logistic Regression with selected features')
    print(log_v2.summary())

    return log, log_v2, beta_top_features_df

################################ Calculate Beta Coeff ################################

standardised_logistic, logistic_beta_features, beta_top_features_df = \
    beta_coeff(train_features_v2, train_target, beta_threshold)
beta_top_features_df.head(n=20)
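Because the features are standardised before fitting, the coefficient magnitudes are directly comparable across features. A quick sketch (using the standardised_logistic model returned above) for inspecting the full ranking rather than just the top n:

# Full ranking of absolute standardised coefficients, largest first
coef_ranking = (pd.Series(standardised_logistic.coef_.squeeze(),
                          index=train_features_v2.columns)
                .abs()
                .sort_values(ascending=False))
print(coef_ranking)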

B) Lasso Regression

#4. Select the features identified by Lasso regression
# Lasso
lasso_param = .01

################################ Functions ################################

def lasso(data, train_target, lasso_param):

    # Inputs
    # data - input feature data
    # train_target - target variable training data
    # lasso_param - Lasso l1 penalty term

    # Fit logistic regression with an l1 penalty
    # Manual Change in Parameters - Logistic Regression
    # Link to function parameters - https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
    # C is the inverse of regularisation strength: smaller C means stronger shrinkage
    log = LogisticRegression(penalty='l1', solver='liblinear',
                             C=lasso_param)
    log.fit(data, train_target)

    # Select features whose coefficients were not shrunk to zero
    lasso_df = pd.DataFrame(columns=['Feature', 'Lasso_Coef'])
    lasso_df['Feature'] = data.columns
    lasso_df['Lasso_Coef'] = log.coef_.squeeze().tolist()
    lasso_df_v2 = lasso_df[lasso_df['Lasso_Coef'] != 0]
    lasso_top_features = lasso_df_v2['Feature'].tolist()

    lasso_top_features_df = pd.DataFrame(lasso_top_features,
                                         columns=['Feature'])
    lasso_top_features_df['Method'] = 'Lasso'

    # Logistic regression with selected features
    log_v2 = sm.Logit(train_target,
                      sm.add_constant(data[lasso_top_features])).fit()
    print('Logistic Regression with selected features')
    print(log_v2.summary())

    return log_v2, lasso_top_features_df

################################ Calculate Lasso ################################

logistic_lasso_features, lasso_top_features_df = \
    lasso(train_features_v2, train_target, lasso_param)
lasso_top_features_df.head(n=20)
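The penalty strength lasso_param is set by hand above. One alternative, sketched here with scikit-learn's LogisticRegressionCV and a purely illustrative grid of C values, is to pick it by cross-validation:

from sklearn.linear_model import LogisticRegressionCV

# Cross-validate over a small, illustrative grid of inverse-regularisation strengths
log_cv = LogisticRegressionCV(Cs=[0.001, 0.01, 0.1, 1.0],
                              penalty='l1', solver='liblinear',
                              scoring='f1', cv=5, random_state=101)
log_cv.fit(train_features_v2, train_target)
print('Best C:', log_cv.C_)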

C) Recursive Feature Elimination (RFE)

#5. Select features based on the Recursive Feature Elimination method
# RFECV
rfe_estimator = "XGBoost"
rfe_step = 2
rfe_cv = 5
rfe_scoring = 'f1'

################################ Functions ################################

def rfecv_feature_selection(data, train_target, rfe_estimator, rfe_step, rfe_cv, rfe_scoring):

    # Inputs
    # data - input feature data
    # train_target - target variable training data
    # rfe_estimator - base model (default: Decision Tree)
    # rfe_step - number of features to remove at each iteration
    # rfe_cv - cross-validation splitting strategy
    # rfe_scoring - CV performance scoring metric

    # Initialize the base estimator for RFE
    if rfe_estimator == "XGBoost":
        # Manual Change in Parameters - XGBoost
        # Link to function parameters - https://xgboost.readthedocs.io/en/stable/parameter.html
        estimator_rfe = XGBClassifier(n_jobs=-1, random_state=101)
    elif rfe_estimator == "RandomForest":
        # Manual Change in Parameters - RandomForest
        # Link to function parameters - https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
        estimator_rfe = RandomForestClassifier(n_jobs=-1, random_state=101)
    elif rfe_estimator == "CatBoost":
        # Manual Change in Parameters - CatBoost
        # Link to function parameters - https://catboost.ai/en/docs/concepts/python-reference_catboostclassifier
        estimator_rfe = CatBoostClassifier(iterations=50, verbose=0, random_state=101)
    elif rfe_estimator == "LightGBM":
        # Manual Change in Parameters - LightGBM
        # Link to function parameters - https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html
        estimator_rfe = lgb.LGBMClassifier(n_jobs=-1, random_state=101)
    else:
        # Manual Change in Parameters - DecisionTree
        # Link to function parameters - https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
        estimator_rfe = DecisionTreeClassifier(random_state=101)

    # Fit RFECV
    # Manual Change in Parameters - RFECV
    # Link to function parameters - https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html
    # Scoring metrics - https://scikit-learn.org/stable/modules/model_evaluation.html
    rfecv = RFECV(estimator=estimator_rfe, step=rfe_step, cv=rfe_cv, scoring=rfe_scoring)
    rfecv.fit(data, train_target)

    # Select features based on RFE
    print('Optimal number of features: {}'.format(rfecv.n_features_))
    rfe_df = pd.DataFrame(columns=['Feature', 'rfe_filter'])
    rfe_df['Feature'] = data.columns
    rfe_df['rfe_filter'] = rfecv.support_.tolist()
    rfe_df_v2 = rfe_df[rfe_df['rfe_filter'] == True]
    rfe_top_features = rfe_df_v2['Feature'].tolist()
    print(rfe_top_features)

    rfe_top_features_df = pd.DataFrame(rfe_top_features, columns=['Feature'])
    rfe_top_features_df['Method'] = 'RFECV'

    # Plot CV results
    # Note: grid_scores_ was removed in scikit-learn 1.2;
    # cv_results_['mean_test_score'] is the current equivalent
    %matplotlib inline
    cv_scores = rfecv.cv_results_['mean_test_score']
    plt.figure(figsize=(16, 9))
    plt.title('Recursive Feature Elimination with Cross-Validation', fontsize=18, fontweight='bold', pad=20)
    plt.xlabel('Number of features selected', fontsize=14, labelpad=20)
    plt.ylabel('f1 score', fontsize=14, labelpad=20)
    plt.plot(range(1, len(cv_scores) + 1), cv_scores, color='#303F9F', linewidth=3)
    plt.show()

    return rfe_top_features_df, rfecv

################################ Calculate RFECV ################################

rfe_top_features_df, rfecv = rfecv_feature_selection(train_features_v2, train_target, rfe_estimator, rfe_step, rfe_cv, rfe_scoring)
rfe_top_features_df.head(n=20)
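Once fitted, the RFECV object also works as a transformer. A small sketch (using the rfecv object returned above) of reducing the training matrix to the selected columns:

# Keep only the columns RFECV retained
train_features_rfe = train_features_v2.loc[:, rfecv.support_]
# Equivalent via the transformer API (returns a NumPy array)
train_features_rfe_np = rfecv.transform(train_features_v2)
print(train_features_rfe.shape)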

D) Sequential Feature Selection (SFS)

#6. Select features based on the Sequential Feature Selector
# Sequential Feature Selector
sfs_feature = 10
sfs_direction = 'backward'
sfs_cv = 2
sfs_scoring = 'r2'

################################ Functions ################################

def sfs_feature_selection(data, train_target, sfs_feature, sfs_direction, sfs_cv, sfs_scoring):

    # Inputs
    # data - input feature data
    # train_target - target variable training data
    # sfs_feature - no. of features to select
    # sfs_direction - forward or backward selection
    # sfs_cv - cross-validation splitting strategy
    # sfs_scoring - CV performance scoring metric

    # Base model: unpenalised logistic regression
    logistic = LogisticRegression(penalty=None)

    sfs = SequentialFeatureSelector(estimator=logistic,
                                    n_features_to_select=sfs_feature,
                                    direction=sfs_direction,
                                    cv=sfs_cv,
                                    scoring=sfs_scoring)
    sfs.fit(data, train_target)

    # Select the features flagged by the selector
    sfs_df = pd.DataFrame(columns=['Feature', 'SFS_filter'])
    sfs_df['Feature'] = data.columns
    sfs_df['SFS_filter'] = sfs.get_support().tolist()
    sfs_df_v2 = sfs_df[sfs_df['SFS_filter'] == True]
    sfs_top_features = sfs_df_v2['Feature'].tolist()
    print(sfs_top_features)

    # Logistic regression with selected features
    x_temp = sm.add_constant(data[sfs_top_features])
    log_v2 = sm.Logit(train_target, x_temp).fit()
    print(log_v2.summary())

    sfs_top_features_df = pd.DataFrame(sfs_top_features,
                                       columns=['Feature'])
    sfs_top_features_df['Method'] = 'Sequential_feature_selector'

    return sfs_top_features_df, sfs

################################ Calculate SFS ################################

sfs_top_features_df, sfs = sfs_feature_selection(train_features_v2, train_target, sfs_feature, sfs_direction, sfs_cv, sfs_scoring)
sfs_top_features_df.head(n=20)
To swap in a different base model, only the estimator argument of the selector needs to change:

sfs = SequentialFeatureSelector(estimator=...,  # add model here
                                n_features_to_select=sfs_feature,
                                direction=sfs_direction,
                                cv=sfs_cv,
                                scoring=sfs_scoring)
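Since every helper above returns a dataframe with the same Feature and Method columns, a natural follow-up (a sketch, assuming all four selection calls above have been run) is to stack the results and count how many methods vote for each feature:

# Stack the per-method selections and vote across methods
all_selected = pd.concat([beta_top_features_df, lasso_top_features_df,
                          rfe_top_features_df, sfs_top_features_df])
votes = (all_selected.groupby('Feature')['Method']
         .count()
         .sort_values(ascending=False))
print(votes.head(10))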

Final Words

Reference Material

Let’s Connect!

