TDS Archive

An archive of data science, data analytics, data engineering, machine learning, and artificial…

Follow publication

AI-Driven Feature Selection in Python!

Deep-dive on ML techniques for feature selection in Python - Part 1

Indraneel Dutta Baruah
TDS Archive
Published in
10 min read · Jul 10, 2022

--

Photo by Edu Grande on Unsplash

“Garbage in, garbage out!”

Blog Series Sections

A) Types of Feature Selection Methods

Image by author

B) Correlation: Pearson, Point Bi-Serial, Cramer’s V

Image by author
Image by author
Image by author
Image by author
# 1. Select the top n features based on absolute correlation with the
#    train_target variable.

# Correlation: feature lists, one per correlation type.
pearson_list = []
point_bi_serial_list = [
    'LIMIT_BAL', 'AGE', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4',
    'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4',
    'PAY_AMT5', 'PAY_AMT6',
]
cramer_list = [
    'SEX_woe', 'EDUCATION_woe', 'MARRIAGE_woe', 'PAY_0_woe', 'PAY_2_woe',
    'PAY_3_woe', 'PAY_4_woe', 'PAY_5_woe', 'PAY_6_woe',
]

# Minimum absolute correlation a feature must exceed to be kept, per method.
pearson_threshold = .5
point_bi_serial_threshold = .5
cramer_threshold = .1
################################ Functions #############################################################
# Function to calculate Cramer's V
def cramers_V(var1, var2):
    """Return Cramér's V association between two categorical variables.

    V = sqrt(chi2 / (n * (min(r, c) - 1))) for an r x c contingency table
    with n observations.  Returns np.nan when either variable has a single
    level (min(r, c) - 1 == 0), which would otherwise divide by zero; the
    calling pipeline drops NaN correlations downstream.
    """
    crosstab = np.array(pd.crosstab(var1, var2,
                                    rownames=None, colnames=None))
    mini = min(crosstab.shape) - 1
    if mini == 0:
        # Degenerate table (a variable with one level): association undefined.
        return np.nan
    stat = chi2_contingency(crosstab)[0]
    obs = np.sum(crosstab)
    # Bug fix: the original returned stat/(obs*mini), which is V**2;
    # Cramér's V is the square root of that quantity.
    return np.sqrt(stat / (obs * mini))
# Overall Correlation Function
# Overall Correlation Function
def corr_feature_selection(data, target, pearson_list,
                           point_bi_serial_list, cramer_list,
                           pearson_threshold,
                           point_bi_serial_threshold,
                           cramer_threshold):
    """Score features against the target with the appropriate correlation
    statistic and keep those whose absolute score clears a per-method
    threshold.

    Inputs
    ------
    data                      - input feature data (pd.DataFrame)
    target                    - target variable (pd.Series)
    pearson_list              - continuous features (if target is continuous)
    point_bi_serial_list      - continuous features (if target is categorical) /
                                categorical features (if target is continuous)
    cramer_list               - categorical features (if target is categorical)
    pearson_threshold         - select features if |pearson correlation| is above this
    point_bi_serial_threshold - select features if |biserial correlation| is above this
    cramer_threshold          - select features if Cramer's V is above this

    Returns
    -------
    corr_data            - full score table with thresholds attached
    corr_top_features_df - selected features, tagged Method='Correlation'
    """
    # Collect one [feature, score, method] row per feature; DataFrame.append
    # was removed in pandas 2.0, so build the frame once from a list instead.
    rows = []

    # Point bi-serial correlation (scipy.stats.pointbiserialr).
    for feat in point_bi_serial_list:
        pbc = pointbiserialr(target, data[feat])
        rows.append([feat, pbc.correlation, "point_bi_serial"])

    # Cramer's V for categorical vs. categorical.
    for feat in cramer_list:
        rows.append([feat, cramers_V(target, data[feat]), "cramer_v"])

    # Pearson correlation (pandas Series.corr).
    for feat in pearson_list:
        rows.append([feat, target.corr(data[feat]), "pearson"])

    corr_data = pd.DataFrame(rows, columns=['Feature',
                                            'Correlation',
                                            'Correlation_Type'])

    # Sort by absolute correlation, drop NA scores and perfect (self)
    # correlations.
    corr_data = corr_data.iloc[corr_data.Correlation.abs().argsort()]
    corr_data = corr_data[corr_data['Correlation'].notna()]
    corr_data = corr_data.loc[corr_data['Correlation'] != 1]

    # Attach the per-method thresholds.
    threshold_df = pd.DataFrame([['pearson', pearson_threshold],
                                 ['point_bi_serial', point_bi_serial_threshold],
                                 ['cramer_v', cramer_threshold]],
                                columns=['Correlation_Type', 'Threshold'])
    corr_data = pd.merge(corr_data, threshold_df,
                         on=['Correlation_Type'], how='left')

    # Select features with greater than user-defined absolute correlation.
    corr_data2 = corr_data.loc[corr_data['Correlation'].abs() > corr_data['Threshold']]
    corr_top_features = corr_data2['Feature'].tolist()
    print(corr_top_features)
    corr_top_features_df = pd.DataFrame(corr_top_features, columns=['Feature'])
    corr_top_features_df['Method'] = 'Correlation'
    return corr_data, corr_top_features_df
################################ Calculate Correlation #############################################################
# Run the correlation-based selection over the training features.
corr_data, corr_top_features_df = corr_feature_selection(
    train_features_v2,
    train_target,
    pearson_list,
    point_bi_serial_list,
    cramer_list,
    pearson_threshold,
    point_bi_serial_threshold,
    cramer_threshold,
)


# Inspect the tail of the score table (highest absolute correlations).
corr_data.tail(30)
Image by author

C) Weight of Evidence (WOE) and Information Value (IV)

Image by author
Image by author
# 2. Select top features based on information value.

# Information value settings.
show_woe = True     # print the per-bin WoE table for every feature
iv_bins = 10        # number of quantile bins for continuous features
iv_threshold = .1   # keep features whose total IV exceeds this
################################ Functions #############################################################
def iv_woe(data, target, iv_bins, iv_threshold, show_woe):
    """Compute Weight of Evidence / Information Value per feature and select
    features whose total IV exceeds ``iv_threshold``.

    Inputs
    ------
    data         - input data including the target variable (pd.DataFrame)
    target       - target variable column name (str)
    iv_bins      - number of quantile bins for continuous features
    iv_threshold - select features with IV greater than this
    show_woe     - print all the bins and features

    Returns
    -------
    newDF               - one row per feature with its total IV
    woeDF               - per-bin WoE/IV detail for all features
    IV_df               - features passing the threshold, sorted by IV descending
    woe_top_features_df - selected features, tagged Method='Information_value'
    """
    # Accumulate per-feature frames and concatenate once at the end instead
    # of growing DataFrames inside the loop.
    iv_rows, woe_rows = [], []

    # Extract column names and run WOE/IV on all the independent variables.
    cols = data.columns
    for ivars in cols[~cols.isin([target])]:
        # Quantile-bin numeric features with more than 10 distinct values;
        # treat everything else as already categorical.
        if (data[ivars].dtype.kind in 'bifc') and (len(np.unique(data[ivars])) > 10):
            binned_x = pd.qcut(data[ivars], iv_bins, duplicates='drop')
            d0 = pd.DataFrame({'x': binned_x, 'y': data[target]})
        else:
            d0 = pd.DataFrame({'x': data[ivars], 'y': data[target]})

        # Count observations and events (target sum) in each group (bin).
        d = d0.groupby("x", as_index=False).agg({"y": ["count", "sum"]})
        d.columns = ['Cutoff', 'N', 'Events']

        # % of (non-)events per bin; the 0.5 floor avoids log(0) and
        # division by zero for bins with no (non-)events.
        d['% of Events'] = np.maximum(d['Events'], 0.5) / d['Events'].sum()
        d['Non-Events'] = d['N'] - d['Events']
        d['% of Non-Events'] = np.maximum(d['Non-Events'], 0.5) / d['Non-Events'].sum()
        # WOE is the natural log of the ratio of the two percentages.
        d['WoE'] = np.log(d['% of Events'] / d['% of Non-Events'])
        d['IV'] = d['WoE'] * (d['% of Events'] - d['% of Non-Events'])
        d.insert(loc=0, column='Variable', value=ivars)
        print("Information value of " + ivars + " is " +
              str(round(d['IV'].sum(), 6)))

        iv_rows.append(pd.DataFrame({"Variable": [ivars],
                                     "IV": [d['IV'].sum()]},
                                    columns=["Variable", "IV"]))
        woe_rows.append(d)
        # Show WOE Table
        if show_woe:
            print(d)

    newDF = pd.concat(iv_rows, axis=0) if iv_rows else pd.DataFrame()
    woeDF = pd.concat(woe_rows, axis=0) if woe_rows else pd.DataFrame()

    # Aggregate IV at feature level and sort descending.
    woeDF_v2 = pd.DataFrame(woeDF.groupby('Variable')['IV'].agg('sum'),
                            columns=['IV']).reset_index()
    woeDF_v3 = woeDF_v2.sort_values(['IV'], ascending=False)
    # Bug fix: the original filtered the unsorted table and left the sorted
    # copy unused; filter the sorted one so results come out IV-descending.
    IV_df = woeDF_v3[woeDF_v3['IV'] > iv_threshold]
    woe_top_features = IV_df['Variable'].tolist()
    print(woe_top_features)
    woe_top_features_df = pd.DataFrame(woe_top_features, columns=['Feature'])
    woe_top_features_df['Method'] = 'Information_value'
    return newDF, woeDF, IV_df, woe_top_features_df
################################ Calculate IV #############################################################
# Prepend the target column so iv_woe can see it alongside the features.
train_features_v3_temp = pd.concat([train_target, train_features_v2], axis=1)

newDF, woeDF, IV_df, woe_top_features_df = iv_woe(
    train_features_v3_temp, target, iv_bins, iv_threshold, show_woe)

# Inspect the first bins of the WoE detail table.
woeDF.head(n=50)
Image by author
Image by author

Final Words

Reference Material

Let’s Connect!

Photo by Pete Pedroza on Unsplash

Free

Distraction-free reading. No ads.

Organize your knowledge with lists and highlights.

Tell your story. Find your audience.

Membership

Read member-only stories

Support writers you read most

Earn money for your writing

Listen to audio narrations

Read offline with the Medium app

--

--

TDS Archive
TDS Archive

Published in TDS Archive

An archive of data science, data analytics, data engineering, machine learning, and artificial intelligence writing from the former Towards Data Science Medium publication.

Indraneel Dutta Baruah
Indraneel Dutta Baruah

Written by Indraneel Dutta Baruah

Striving for excellence in solving business problems using AI!

No responses yet

Write a response