AI-Driven Feature Selection in Python!

Deep-dive on ML techniques for feature selection in Python — Part 2

13 min read · Jul 10, 2022


Photo by Kevin Ku on Unsplash

A) Beta Coefficients

# 3. Select the top n features based on the absolute value of their beta coefficients

# Beta Coefficients
beta_threshold = 10

################################ Functions #############################################################

def beta_coeff(data, train_target, beta_threshold):
    """Select the features with the largest absolute logistic-regression coefficients.

    The features are standardised first so that coefficient magnitudes are
    comparable, an unpenalised logistic regression without intercept is fitted
    with scikit-learn, and the ``beta_threshold`` features with the highest
    absolute coefficient are kept. A statsmodels ``Logit`` (with constant) is
    then fitted on the unscaled selected features and its summary is printed.

    Args:
        data: Input feature data (pandas DataFrame).
        train_target: Target variable training data.
        beta_threshold: Number of features to keep, ranked by absolute beta
            coefficient value.

    Returns:
        A tuple ``(log, log_v2, beta_top_features_df)``: the scikit-learn model
        fitted on the standardised data, the fitted statsmodels result for the
        selected features, and a DataFrame with columns ``Feature`` and
        ``Method`` listing the selected features.

    Raises:
        ValueError: If ``beta_threshold`` is negative.
    """
    # A negative count would make ``tail()`` silently drop the *weakest*
    # features instead of keeping the strongest ones.
    if beta_threshold < 0:
        raise ValueError("beta_threshold must be a non-negative number of features")

    # Standardise dataset
    scaler = StandardScaler()
    data_v2 = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)

    # Fit Logistic on Standardised dataset
    # Manual Change in Parameters - Logistic Regression
    # Link to function parameters -
    # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
    # ``penalty=None`` is the spelling current scikit-learn accepts (the old
    # string 'none' was removed) and matches the SFS section of this file.
    log = LogisticRegression(fit_intercept=False, penalty=None)
    log.fit(data_v2, train_target)

    # One row per feature, sorted ascending by absolute coefficient so that
    # the strongest features end up at the tail.
    coef_table = pd.DataFrame(
        {"Feature": list(data_v2.columns), "Coefs": log.coef_.ravel()}
    )
    coef_table = coef_table.iloc[coef_table["Coefs"].abs().argsort()]
    beta_top_features = coef_table.tail(beta_threshold)["Feature"].tolist()

    beta_top_features_df = pd.DataFrame(beta_top_features, columns=['Feature'])
    beta_top_features_df['Method'] = 'Beta_coefficients'

    # Logistic Regression with selected features
    log_v2 = sm.Logit(train_target, sm.add_constant(data[beta_top_features])).fit()
    print('Logistic Regression with selected features')
    print(log_v2.summary())

    return log, log_v2, beta_top_features_df
################################ Calculate Beta Coeff ################################################

# Run the beta-coefficient selection on the training data and preview the result.
standardised_logistic, logistic_beta_features, beta_top_features_df = beta_coeff(
    train_features_v2, train_target, beta_threshold
)
beta_top_features_df.head(n=20)
Image by author

B) Lasso Regression

Image by author
# 4. Select the features identified by Lasso regression

# Lasso
lasso_param = .01

################################ Functions #############################################################

def lasso(data, train_target, lasso_param):
    """Select the features that keep a non-zero coefficient under an L1 penalty.

    An L1-penalised logistic regression is fitted with scikit-learn; every
    feature whose coefficient is not exactly zero is selected. A statsmodels
    ``Logit`` (with constant) is then fitted on the selected features and its
    summary is printed.

    Args:
        data: Input feature data (pandas DataFrame).
        train_target: Target variable training data.
        lasso_param: Lasso l1 penalty term. It is passed to scikit-learn as
            ``C``, which is the *inverse* of the regularisation strength, so
            smaller values shrink more coefficients to zero.

    Returns:
        A tuple ``(log_v2, lasso_top_features_df)``: the fitted statsmodels
        result for the selected features, and a DataFrame with columns
        ``Feature`` and ``Method`` listing the selected features.
    """
    # Fit Logistic
    # Manual Change in Parameters - Logistic Regression
    # Link to function parameters -
    # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
    log = LogisticRegression(penalty='l1', solver='liblinear', C=lasso_param)
    log.fit(data, train_target)

    # Select Features: keep those the L1 penalty did not shrink to zero
    lasso_df = pd.DataFrame(columns=['Feature', 'Lasso_Coef'])
    lasso_df['Feature'] = data.columns
    lasso_df['Lasso_Coef'] = log.coef_.squeeze().tolist()
    lasso_top_features = lasso_df.loc[lasso_df['Lasso_Coef'] != 0, 'Feature'].tolist()

    lasso_top_features_df = pd.DataFrame(lasso_top_features, columns=['Feature'])
    lasso_top_features_df['Method'] = 'Lasso'

    # Logistic Regression with selected features
    log_v2 = sm.Logit(train_target, sm.add_constant(data[lasso_top_features])).fit()
    print('Logistic Regression with selected features')
    print(log_v2.summary())

    return log_v2, lasso_top_features_df
################################ Calculate Lasso ################################################

# Run the Lasso selection on the training data and preview the result.
logistic_lasso_features, lasso_top_features_df = lasso(
    train_features_v2, train_target, lasso_param
)
lasso_top_features_df.head(n=20)
Image by author

C) Recursive Feature Elimination (RFE)

# 5. Select features based on Recursive Feature Selection method

# RFECV
rfe_estimator = "XGBoost"
rfe_step = 2
rfe_cv = 5
rfe_scoring = 'f1'

################################ Functions #############################################################

def rfecv_feature_selection(data, train_target, rfe_estimator, rfe_step, rfe_cv, rfe_scoring):
    """Select features with cross-validated Recursive Feature Elimination.

    Builds the base model named by ``rfe_estimator``, fits scikit-learn's
    ``RFECV`` on the data, prints the optimal number of features, collects the
    features RFECV kept, and plots the cross-validation score against the
    number of features.

    Args:
        data: Input feature data (pandas DataFrame).
        train_target: Target variable training data.
        rfe_estimator: Base model name: "XGBoost", "RandomForest", "CatBoost"
            or "LightGBM". Any other value falls back to a Decision Tree.
        rfe_step: Number of features to remove at each iteration.
        rfe_cv: Cross-validation splitting strategy.
        rfe_scoring: CV performance scoring metric.

    Returns:
        A tuple ``(rfe_top_features_df, rfecv)``: a DataFrame with columns
        ``Feature`` and ``Method`` listing the selected features, and the
        fitted ``RFECV`` object.
    """
    ## Initialize RFE
    if rfe_estimator == "XGBoost":
        # Manual Change in Parameters - XGBoost
        # Link to function parameters -
        estimator_rfe = XGBClassifier(n_jobs=-1, random_state=101)
    elif rfe_estimator == "RandomForest":
        # Manual Change in Parameters - RandomForest
        # Link to function parameters -
        estimator_rfe = RandomForestClassifier(n_jobs=-1, random_state=101)
    elif rfe_estimator == "CatBoost":
        # Manual Change in Parameters - CatBoost
        # Link to function parameters -
        estimator_rfe = CatBoostClassifier(iterations=50, verbose=0, random_state=101)
    elif rfe_estimator == "LightGBM":
        # Manual Change in Parameters - LightGBM
        # Link to function parameters -
        estimator_rfe = lgb.LGBMClassifier(n_jobs=-1, random_state=101)
    else:
        # Default: Decision Tree. Without this ``else`` the fallback would
        # overwrite whichever estimator was chosen above.
        # Manual Change in Parameters - DecisionTree
        # Link to function parameters -
        estimator_rfe = DecisionTreeClassifier(random_state=101)

    # Manual Change in Parameters - RFECV
    # Link to function parameters -
    # https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html
    # Scoring metrics -
    # https://scikit-learn.org/stable/modules/model_evaluation.html
    rfecv = RFECV(estimator=estimator_rfe, step=rfe_step, cv=rfe_cv, scoring=rfe_scoring)
    rfecv.fit(data, train_target)

    # Select feature based on RFE
    print('Optimal number of features: {}'.format(rfecv.n_features_))
    rfe_top_features = data.columns[rfecv.support_].tolist()

    rfe_top_features_df = pd.DataFrame(rfe_top_features, columns=['Feature'])
    rfe_top_features_df['Method'] = 'RFECV'

    # Plot CV results
    # NOTE: ``%matplotlib inline`` is a notebook magic and is not valid inside
    # a Python function; run it once in a notebook cell if inline plots are
    # needed.
    # Newer scikit-learn exposes the scores through ``cv_results_`` (the older
    # ``grid_scores_`` attribute was removed); fall back for old versions.
    cv_results = getattr(rfecv, "cv_results_", None)
    if cv_results is not None:
        mean_scores = cv_results["mean_test_score"]
        # Recent scikit-learn also reports the real feature count per step,
        # which differs from 1..k whenever rfe_step > 1.
        feature_counts = cv_results.get("n_features", range(1, len(mean_scores) + 1))
    else:
        mean_scores = rfecv.grid_scores_
        feature_counts = range(1, len(mean_scores) + 1)

    plt.figure(figsize=(16, 9))
    plt.title('Recursive Feature Elimination with Cross-Validation', fontsize=18, fontweight='bold', pad=20)
    plt.xlabel('Number of features selected', fontsize=14, labelpad=20)
    # Label the axis with the metric actually used instead of a hard-coded f1.
    plt.ylabel('{} score'.format(rfe_scoring), fontsize=14, labelpad=20)
    plt.plot(feature_counts, mean_scores, color='#303F9F', linewidth=3)

    return rfe_top_features_df, rfecv
################################ Calculate RFECV #############################################################

# Run RFECV on the training data.
rfe_top_features_df, rfecv = rfecv_feature_selection(
    train_features_v2, train_target, rfe_estimator, rfe_step, rfe_cv, rfe_scoring
)
Image by author

D) Sequential Feature Selection (SFS)

# 6. Select features based on Sequential Feature Selector

# Sequential Feature Selector
sfs_feature = 10
sfs_direction = 'backward'
sfs_cv = 2
# NOTE(review): 'r2' is a regression metric while the estimator below is a
# classifier; a classification metric (e.g. 'f1', as used for RFECV) may be
# what was intended - confirm before changing.
sfs_scoring = 'r2'

################################ Functions #############################################################

def sfs_feature_selection(data, train_target, sfs_feature, sfs_direction, sfs_cv, sfs_scoring):
    """Select features with scikit-learn's Sequential Feature Selector.

    An unpenalised logistic regression is used as the base estimator. After
    the selector is fitted, a statsmodels ``Logit`` (with constant) is fitted
    on the selected features and its summary is printed.

    Args:
        data: Input feature data (pandas DataFrame).
        train_target: Target variable training data.
        sfs_feature: No. of features to select.
        sfs_direction: 'forward' or 'backward' selection.
        sfs_cv: Cross-validation splitting strategy.
        sfs_scoring: CV performance scoring metric.

    Returns:
        A tuple ``(sfs_top_features_df, sfs)``: a DataFrame with columns
        ``Feature`` and ``Method`` listing the selected features, and the
        fitted ``SequentialFeatureSelector``.
    """
    logistic = LogisticRegression(penalty=None)

    # Link to function parameters -
    # https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SequentialFeatureSelector.html
    sfs = SequentialFeatureSelector(
        estimator=logistic,
        # Without this the documented ``sfs_feature`` argument had no effect.
        n_features_to_select=sfs_feature,
        direction=sfs_direction,
        cv=sfs_cv,
        scoring=sfs_scoring,
    )
    sfs.fit(data, train_target)

    # Use the ``data`` argument (not the notebook-global training frame) so
    # the function works for whatever feature set the caller passes in.
    sfs_top_features = data.columns[sfs.get_support()].tolist()

    # Logistic Regression with selected features
    x_temp = sm.add_constant(data[sfs_top_features])
    log_v2 = sm.Logit(train_target, x_temp).fit()
    print(log_v2.summary())

    sfs_top_features_df = pd.DataFrame(sfs_top_features, columns=['Feature'])
    # NOTE(review): the other selectors in this file tag their output with a
    # 'Method' column; the label used here is a reconstruction - confirm it
    # matches what downstream code expects.
    sfs_top_features_df['Method'] = 'SFS'

    return sfs_top_features_df, sfs

################################ Calculate SFS #############################################################

# Run the Sequential Feature Selector on the training data.
sfs_top_features_df, sfs = sfs_feature_selection(
    train_features_v2, train_target, sfs_feature, sfs_direction, sfs_cv, sfs_scoring
)
Image by author
# Template: bind `model` to any scikit-learn estimator instance before running.
sfs = SequentialFeatureSelector(
    estimator=model,  # add model here
    direction=sfs_direction,
    cv=sfs_cv,
    scoring=sfs_scoring,
)

Final Words

Reference Material

