Study/Python
파이썬 회귀분석 팁
IT파스칼
2021. 2. 4. 09:22
Scikit-learn indeed does not support stepwise regression. That's because what is commonly known as 'stepwise regression' is an algorithm based on the p-values of the coefficients of a linear regression, and scikit-learn deliberately avoids an inferential approach to model learning (significance testing, etc.). Moreover, pure OLS is only one of numerous regression algorithms, and from the scikit-learn point of view it is neither very important nor one of the best.
추론적 접근 (inferential approach)
#Copyright 2019 Sinan Talha Hascelik
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
# Repo: https://github.com/talhahascelik/python_stepwiseSelection
import numpy as np
import pandas as pd
import statsmodels.formula.api as sm
def forwardSelection(X, y, model_type ="linear",elimination_criteria = "aic", varchar_process = "dummy_dropfirst", sl=0.05):
    """
    Select features by forward stepwise regression.

    The independent variables are first passed through
    ``__varcharProcessing__`` (character columns are dropped or
    dummy-encoded and an "intercept" column is added), then handed to
    ``__forwardSelectionRaw__``, which enters one column at a time.
    No fitted model is returned, only the selected column names and a log.

    Required libraries: pandas, numpy, statsmodels.

    Parameters
    ----------
    X : pandas DataFrame
        Independent variables.
    y : pandas Series or DataFrame
        Dependent variable.
    model_type : str
        'linear' or 'logistic'.
    elimination_criteria : str or None
        'aic'   - Akaike information criterion
        'bic'   - Bayesian information criterion
        'r2'    - R-squared (linear model type only)
        'adjr2' - adjusted R-squared (linear model type only)
        None    - rely on the p-value test alone
    varchar_process : str
        'drop'            - drop character (object dtype) columns
        'dummy'           - create dummies for every level
        'dummy_dropfirst' - create dummies and drop the first level
    sl : float
        Significance level a candidate's p-value must not exceed
        (default 0.05).

    Returns
    -------
    columns : list
        Names of the selected columns, starting with "intercept".
    iteration_logs : str
        Text log of the model summaries produced while selecting.

    Notes
    -----
    Originally tested by the author on Python v3.6.7, Pandas v0.23.4,
    Numpy v1.15.04, StatsModels v0.9.0.

    See Also
    --------
    https://en.wikipedia.org/wiki/Stepwise_regression
    """
    design_matrix = __varcharProcessing__(X, varchar_process=varchar_process)
    return __forwardSelectionRaw__(
        design_matrix,
        y,
        model_type=model_type,
        elimination_criteria=elimination_criteria,
        sl=sl,
    )
def backwardSelection(X, y, model_type ="linear",elimination_criteria = "aic", varchar_process = "dummy_dropfirst", sl=0.05):
    """
    Select features by backward stepwise regression.

    The independent variables are first passed through
    ``__varcharProcessing__`` (character columns are dropped or
    dummy-encoded and an "intercept" column is added), then handed to
    ``__backwardSelectionRaw__``, which starts from all columns and
    removes them step by step.
    No fitted model is returned, only the remaining column names and a log.

    Required libraries: pandas, numpy, statsmodels.

    Parameters
    ----------
    X : pandas DataFrame
        Independent variables.
    y : pandas Series or DataFrame
        Dependent variable.
    model_type : str
        'linear' or 'logistic'.
    elimination_criteria : str or None
        'aic'   - Akaike information criterion
        'bic'   - Bayesian information criterion
        'r2'    - R-squared (linear model type only)
        'adjr2' - adjusted R-squared (linear model type only)
        None    - rely on the p-value test alone
    varchar_process : str
        'drop'            - drop character (object dtype) columns
        'dummy'           - create dummies for every level
        'dummy_dropfirst' - create dummies and drop the first level
    sl : float
        Significance level; columns whose p-value exceeds it are
        candidates for removal (default 0.05).

    Returns
    -------
    columns : list
        Names of the columns kept in the final model.
    iteration_logs : str
        Text log of the model summaries produced while eliminating.

    Notes
    -----
    Originally tested by the author on Python v3.6.7, Pandas v0.23.4,
    Numpy v1.15.04, StatsModels v0.9.0.

    See Also
    --------
    https://en.wikipedia.org/wiki/Stepwise_regression
    """
    design_matrix = __varcharProcessing__(X, varchar_process=varchar_process)
    return __backwardSelectionRaw__(
        design_matrix,
        y,
        model_type=model_type,
        elimination_criteria=elimination_criteria,
        sl=sl,
    )
def __varcharProcessing__(X, varchar_process = "dummy_dropfirst"):
    """
    Prepare the independent variables for the stepwise routines.

    Character (object dtype) columns are either dropped or dummy-encoded,
    then a constant "intercept" column is added and placed first.
    The names of the character columns are reported on stdout.
    The input frame itself is not modified; a new frame is returned.

    Parameters
    ----------
    X : pandas DataFrame
        Independent variables.
    varchar_process : str
        'drop'            - drop object dtype columns
        'dummy'           - pd.get_dummies with every level kept
        'dummy_dropfirst' - pd.get_dummies with the first level dropped
        Any other value is handled like 'dummy_dropfirst'.

    Returns
    -------
    pandas DataFrame
        Processed frame whose first column is "intercept" (all ones).
    """
    # Compare against the builtin `object`: the `np.object` alias was
    # deprecated in NumPy 1.20 and removed in NumPy 1.24.
    dtypes = X.dtypes
    varchar_cols = dtypes[dtypes == object].index.tolist()

    if varchar_process == "drop":
        X = X.drop(columns=varchar_cols)
        print("Character Variables (Dropped):", varchar_cols)
    elif varchar_process == "dummy":
        # dtype is pinned to uint8 (the historical default): pandas 2.0
        # switched the default to bool, and bool columns mixed with numeric
        # ones turn the design matrix into an object array.
        X = pd.get_dummies(X, drop_first=False, dtype=np.uint8)
        print("Character Variables (Dummies Generated):", varchar_cols)
    else:
        # "dummy_dropfirst" and every unrecognised value share this branch.
        X = pd.get_dummies(X, drop_first=True, dtype=np.uint8)
        print("Character Variables (Dummies Generated, First Dummies Dropped):", varchar_cols)

    X["intercept"] = 1
    # Put "intercept" first by name rather than by position, so a frame that
    # already carried an "intercept" column is not reordered incorrectly.
    ordered = ["intercept"] + [c for c in X.columns.tolist() if c != "intercept"]
    return X[ordered]
def __forwardSelectionRaw__(X, y, model_type ="linear",elimination_criteria = "aic", sl=0.05):
    """
    Run forward stepwise selection on an already prepared design matrix.

    Starting from the "intercept" column alone, every remaining column is
    tried in turn. The candidate with the smallest p-value not exceeding
    ``sl`` is entered, provided the chosen criterion improves. Selection
    stops when no candidate passes the p-value test or when the criterion
    stops improving. Progress and the final model summary are printed.

    Parameters
    ----------
    X : pandas DataFrame
        Design matrix; must contain a column named "intercept".
    y : pandas Series or DataFrame
        Dependent variable, passed unchanged to the statsmodels estimator.
    model_type : str
        'linear' (OLS) or 'logistic' (Logit). Any other value is reported
        on stdout once and treated as 'linear'.
    elimination_criteria : str or None
        'aic' / 'bic' (lower is better) or 'r2' / 'adjr2' (higher is
        better, linear models only). With any other value, or with
        'r2' / 'adjr2' on a logistic model, only the p-value test is used.
    sl : float
        Significance level a candidate's p-value must not exceed.

    Returns
    -------
    selected_cols : list
        Entered column names, starting with "intercept".
    iterations_log : str
        Summary text of every candidate model that passed the p-value test.

    Raises
    ------
    ValueError
        If X has no "intercept" column.
    """
    # The estimator classes live in statsmodels.api; current statsmodels
    # releases do not expose OLS/Logit through statsmodels.formula.api.
    import statsmodels.api as sm_api

    # Resolve the fallback once so the criterion checks below see the model
    # type that is actually fitted.
    if model_type not in ("linear", "logistic"):
        print("\nWrong Model Type : "+ str(model_type) +"\nLinear model type is seleted.")
        model_type = "linear"
    estimator = sm_api.Logit if model_type == "logistic" else sm_api.OLS

    def fit(columns):
        # Fit the chosen estimator on the given subset of columns.
        return estimator(y, X[columns]).fit()

    # criterion -> (results attribute, progress label, higher is better,
    #               message printed when the criterion stops improving)
    criteria_table = {
        "aic": ("aic", "AIC", False, "break : Criteria"),
        "bic": ("bic", "BIC", False, "break : Criteria"),
    }
    if model_type == "linear":
        criteria_table["r2"] = ("rsquared", "R2", True, "break : Criteria")
        criteria_table["adjr2"] = ("rsquared_adj", "AdjR2", True, "Break : Criteria")
    rule = criteria_table.get(elimination_criteria)

    selected_cols = ["intercept"]
    other_cols = X.columns.tolist()
    other_cols.remove("intercept")

    # Baseline criterion value comes from the intercept-only model.
    baseline = fit(selected_cols)
    criteria = getattr(baseline, rule[0]) if rule is not None else None

    log_parts = []
    for _ in range(X.shape[1]):
        # Find the candidate with the smallest p-value that passes `sl`,
        # keeping its fitted model so it does not have to be refitted.
        best_pval, entering, model = None, None, None
        for name in other_cols:
            trial = fit(selected_cols + [name])
            pval = trial.pvalues[name]
            # NaN p-values fail this comparison and are skipped.
            if pval <= sl and (best_pval is None or pval < best_pval):
                best_pval, entering, model = pval, name, trial

        if entering is None:
            print("Break : Significance Level")
            break

        # The candidate is logged before the criterion check, so the log
        # also shows the model that caused selection to stop.
        log_parts.append("\nEntered : " + str(entering) + "\n")
        log_parts.append("\n\n"+str(model.summary())+"\nAIC: "+ str(model.aic) + "\nBIC: "+ str(model.bic)+"\n\n")

        if rule is not None:
            attr, label, higher_is_better, stop_message = rule
            new_criteria = getattr(model, attr)
            if higher_is_better:
                improved = new_criteria > criteria
            else:
                improved = new_criteria < criteria
            if not improved:
                print(stop_message)
                break
            print("Entered :", entering, "\t" + label + " :", new_criteria)
            criteria = new_criteria
        else:
            print("Entered :", entering)

        selected_cols.append(entering)
        other_cols.remove(entering)

    # Refit on the final selection for the closing report.
    model = fit(selected_cols)
    print(model.summary())
    print("AIC: "+str(model.aic))
    print("BIC: "+str(model.bic))
    print("Final Variables:", selected_cols)
    return selected_cols, "".join(log_parts)
def __backwardSelectionRaw__(X, y, model_type ="linear",elimination_criteria = "aic", sl=0.05):
    """
    Run backward stepwise elimination on an already prepared design matrix.

    A model is fitted on all columns. While the largest coefficient
    p-value exceeds ``sl``, the column(s) holding that p-value are removed
    and the model is refitted. If the refitted model is worse on the
    chosen criterion, the removal is undone ("Regained") and elimination
    stops. Progress and the final model summary are printed. The input
    frame is not modified.

    Parameters
    ----------
    X : pandas DataFrame
        Design matrix (every column, including any intercept column, is a
        candidate for removal).
    y : pandas Series or DataFrame
        Dependent variable, passed unchanged to the statsmodels estimator.
    model_type : str
        'linear' (OLS) or 'logistic' (Logit). Any other value is reported
        on stdout once and treated as 'linear'.
    elimination_criteria : str or None
        'aic' / 'bic' (lower is better) or 'r2' / 'adjr2' (higher is
        better, linear models only). With any other value, or with
        'r2' / 'adjr2' on a logistic model, only the p-value test is used.
    sl : float
        Significance level; elimination continues while the largest
        p-value is above it.

    Returns
    -------
    cols : list
        Column names of the final model.
    iterations_log : str
        Summary text of the models fitted along the way.

    Raises
    ------
    ValueError
        If X has no columns.
    """
    # The estimator classes live in statsmodels.api; current statsmodels
    # releases do not expose OLS/Logit through statsmodels.formula.api.
    import statsmodels.api as sm_api

    if X.shape[1] == 0:
        raise ValueError("X must contain at least one column")

    # Resolve the fallback once so the criterion checks below see the model
    # type that is actually fitted.
    if model_type not in ("linear", "logistic"):
        print("\nWrong Model Type : "+ str(model_type) +"\nLinear model type is seleted.")
        model_type = "linear"
    estimator = sm_api.Logit if model_type == "logistic" else sm_api.OLS

    # criterion -> (results attribute, higher is better)
    criteria_table = {"aic": ("aic", False), "bic": ("bic", False)}
    if model_type == "linear":
        criteria_table["r2"] = ("rsquared", True)
        criteria_table["adjr2"] = ("rsquared_adj", True)
    rule = criteria_table.get(elimination_criteria)

    def fit(columns):
        # Fit the chosen estimator on the given subset of columns.
        return estimator(y, X[columns]).fit()

    def report(results):
        # Summary text followed by the AIC/BIC lines used in logs and prints.
        return str(results.summary())+"\nAIC: "+ str(results.aic) + "\nBIC: "+ str(results.bic)

    # Track surviving column names instead of deleting from X, so the
    # caller's frame is left untouched.
    remaining = X.columns.tolist()
    cols = list(remaining)
    log_parts = []
    last_eliminated = ""
    model = None

    for _ in range(X.shape[1]):
        if not remaining:
            # Tied eliminations removed every column; keep the last model.
            break

        candidate = fit(remaining)
        if model is not None and rule is not None:
            attr, higher_is_better = rule
            previous_value = getattr(model, attr)
            new_value = getattr(candidate, attr)
            if higher_is_better:
                got_worse = previous_value > new_value
            else:
                got_worse = previous_value < new_value
            if got_worse:
                # Keep the previous model and column list: the last
                # elimination is undone.
                print("Regained : ", last_eliminated)
                log_parts.append("\n" + report(candidate) + "\n")
                log_parts.append("\n\nRegained : " + str(last_eliminated) + "\n\n")
                break

        model = candidate
        log_parts.append("\n" + report(model) + "\n")

        cols = list(remaining)
        max_pval = max(model.pvalues)
        if not max_pval > sl:
            break
        # Every column whose p-value equals the maximum is removed.
        for name in cols:
            if model.pvalues[name] == max_pval:
                print("Eliminated :", name)
                log_parts.append("\n\nEliminated : " + str(name) + "\n\n")
                remaining.remove(name)
                last_eliminated = name

    print(report(model))
    print("Final Variables:", cols)
    log_parts.append("\n" + report(model) + "\n")
    return cols, "".join(log_parts)
참고
datascience.stackexchange.com/questions/24405/how-to-do-stepwise-regression-using-sklearn