End to end Logistic Regression in R

Logistic regression, or logit regression, is a regression model in which the dependent variable is categorical. The code below performs end-to-end logistic regression in R, including data preprocessing, training and evaluation. The dataset used can be downloaded from here.

# Topics Covered
#
# 1. Reading data and Summary Statistics
# 2. Outlier Detection
# 3. Missing Value Treatment
# 4. Correlation and VIF
# 5. Feature Selection Using IV
# 6. Creating Training and validation Sets
# 7. Running the Logistic Model on Training Set
# 8. Evaluating Performance on Validation Set
# a. ROC and AUC
# b. Confusion Matrix
# c. KS Statistic
# 9. Scoring the test data

##############################################################################
# Reading data and Summary Statistics

# Change the working directory to the project folder. The relative paths used
# in this script ("data/...", "others", "final_probs.csv") are resolved against
# it, so edit this path to match your machine.
setwd("C:\\Desktop\\Classification & Clustering")

train_data <- read.csv("data/train.csv")
test_data <- read.csv("data/test.csv")

# Summary Statistics
library(Hmisc)

# First rows, structure and per-column summaries of the training data
head(train_data)
str(train_data)
summary(train_data)
describe(train_data)

# Same overview for the test data
head(test_data)
str(test_data)
summary(test_data)
describe(test_data)

# 2-way contingency table of admit by prestige
xtabs(~ admit + prestige, data = train_data)

##############################################################################
# Outlier Detection

# Percentile profile of the first three columns, to spot extreme values
sapply(
  train_data[, 1:3],
  function(x) {
    quantile(x, c(.01, .05, .25, .5, .75, .90, .95, .99, 1), na.rm = TRUE)
  }
)

# gpa of 6.5 seems to be an outlier: cap every gpa above 4 at 4
train_data$gpa[train_data$gpa > 4] <- 4

##############################################################################
# Missing Value Imputation

# Count the missing values in each column
sapply(train_data, function(x) sum(is.na(x)))

# Replace missing gre values with the mean of the observed gre values
train_data$gre[is.na(train_data$gre)] <- mean(train_data$gre, na.rm = TRUE)

# Same mean imputation applied to every numeric column of a copy.
# Each column is rebuilt and assigned back to the data frame: an assignment
# made inside an anonymous function would only change a local copy, and
# iterating over a data frame yields column values rather than column names.
train_data2 <- train_data

numeric_cols <- vapply(train_data2, is.numeric, logical(1))
train_data2[numeric_cols] <- lapply(
  train_data2[numeric_cols],
  function(x) {
    x[is.na(x)] <- mean(x, na.rm = TRUE)
    x
  }
)
##############################################################################
# Correlation and VIF

# Pairwise correlation matrix of the first three columns
cor(train_data[, seq_len(3)])

# Variance inflation factors for the same three columns
library(usdm)
vif(train_data[, seq_len(3)])

##############################################################################
# Information Value

library(plyr)
library(sqldf)
library(rpart)

# NOTE(review): iv.mult() used below is not loaded by any library() call in
# this script; it is presumably defined by the helper scripts sourced here.
# "C:\\xyz.R" looks like a placeholder path -- point it (and the "others"
# folder) at the real information-value code before running.
source("C:\\xyz.R")

# Source every file found in the "others" folder into the global environment
file.sources <- list.files("others", full.names = TRUE)
sapply(file.sources, source, local = .GlobalEnv)

# Work on a copy whose response is a factor with "1" as the first level
data <- train_data
data$admit <- factor(data$admit, levels = c("1", "0"))
levels(data$admit)

str(data)
# summary is passed as a logical flag rather than the string "TRUE"
iv.mult(data, y = "admit", vars = c("gre", "gpa", "prestige"), summary = TRUE)

##############################################################################
# Create training and validation sets

# Fixed seed so the random 70/30 split is reproducible
set.seed(123)
smp_size <- floor(0.7 * nrow(train_data))

# Row indices drawn (without replacement) for the training part
train_ind <- sample(seq_len(nrow(train_data)), size = smp_size)

# Sampled rows form the training set; the remaining rows form the validation set
training <- train_data[train_ind, ]
validation <- train_data[-train_ind, ]

##############################################################################
# Running the Logistic Model on Training set

# Help pages for lm, describe and glm
?lm
?describe
?glm

# The model formula: admit explained by gre, gpa and prestige
admit ~ gre + gpa + prestige

# Model with all three predictors
mylogit <- glm(admit ~ gre + gpa + prestige, data = training, family = "binomial")

# Reduced model without gre
mylogit2 <- glm(admit ~ gpa + prestige, data = training, family = "binomial")

summary(mylogit2)
# See how prestige has been used as a dummy variable

# 90% confidence intervals for the coefficients of the three-predictor model
confint(mylogit, level = .90)

# Calculating Concordance
# Refer to the blog here to see about Concordance
# http://shashiasrblog.blogspot.in/2014/02/binary-logistic-regression-fast.html

# Concordance statistics for a fitted binary model.
#
# Every (actual 1, actual 0) pair of observations is compared on its fitted
# value: the pair is concordant when the 1 scores higher, discordant when it
# scores lower, and tied otherwise.
#
# Args:
#   model: an object exposing `y` (observed 0/1 responses) and `fitted.values`
#          (fitted scores of the same length), as read via model$y and
#          model$fitted.values.
#
# Returns:
#   A list with "Concordance", "Discordance" and "Tied" (shares of all pairs)
#   and "Pairs" (number of pairs compared). The three shares are NA when there
#   are no pairs, i.e. when either class is absent.
fastConc <- function(model) {
  # Get all actual observations and their fitted values into a frame
  fitted <- data.frame(respvar = model$y, score = model$fitted.values)

  # Split into the observed ones and the observed zeros
  ones <- fitted[fitted$respvar == 1, ]
  zeros <- fitted[fitted$respvar == 0, ]

  # Computed in double so that large samples cannot overflow integer range
  pairs_tested <- as.numeric(nrow(ones)) * nrow(zeros)
  conc <- 0
  disc <- 0

  # seq_len() skips the loop entirely when there are no ones
  # (1:nrow(ones) would iterate over c(1, 0) instead)
  for (i in seq_len(nrow(ones))) {
    conc <- conc + sum(ones[i, "score"] > zeros[, "score"])
    disc <- disc + sum(ones[i, "score"] < zeros[, "score"])
  }

  # Calculate concordance, discordance and ties
  if (pairs_tested == 0) {
    concordance <- NA_real_
    discordance <- NA_real_
    ties_perc <- NA_real_
  } else {
    concordance <- conc / pairs_tested
    discordance <- disc / pairs_tested
    ties_perc <- 1 - concordance - discordance
  }

  list(
    "Concordance" = concordance,
    "Discordance" = discordance,
    "Tied" = ties_perc,
    "Pairs" = pairs_tested
  )
}

fastConc(mylogit)
##############################################################################
# Check Performance on the Validation Set

# Predicted probabilities for the validation rows
val <- predict(mylogit, validation, type = "response")

mydf <- cbind(validation, val)

# Predicted class at a 0.5 cut-off. The levels are fixed to 0/1 so that both
# classes are present even if every prediction falls on one side of the cut-off.
mydf$response <- factor(ifelse(mydf$val > 0.5, 1, 0), levels = c(0, 1))

library(ROCR)
logit_scores <- prediction(predictions = mydf$val, labels = mydf$admit)

# PLOT ROC CURVE
logit_perf <- performance(logit_scores, "tpr", "fpr")
plot(logit_perf, col = "darkblue", lwd = 2, xaxs = "i", yaxs = "i", tck = NA,
     main = "ROC Curve")
box()
# Dashed diagonal as the reference line (the original lty = 300 is outside the
# documented line-type codes 0-6)
abline(0, 1, lty = 2, col = "green")
grid(col = "aquamarine")

### AREA UNDER THE CURVE
logit_auc <- performance(logit_scores, "auc")
as.numeric(logit_auc@y.values) ## AUC Value

# CONFUSION MATRIX
library(caret)
# confusionMatrix() needs predictions and reference as factors with the same
# levels; admit is converted here without altering mydf$admit itself.
# positive = "1" reports sensitivity/specificity for the admitted class.
confusionMatrix(
  mydf$response,
  factor(mydf$admit, levels = c(0, 1)),
  positive = "1"
)

### KS STATISTIC
# Largest gap between the true-positive and false-positive rates on the ROC curve
logit_ks <- max(logit_perf@y.values[[1]] - logit_perf@x.values[[1]])
logit_ks

## LIFT CHART
lift.obj <- performance(logit_scores, measure = "lift", x.measure = "rpp")
plot(lift.obj,
     main = "Lift Chart",
     xlab = "% Populations",
     ylab = "Lift",
     col = "blue")
# Intercept 1, slope 0: horizontal reference line at lift = 1
abline(1, 0, col = "grey")

# GAINS TABLE
# install.packages("gains")
library(gains)
# gains table over 10 groups of the predicted probabilities
gains.cross <- gains(actual = mydf$admit, predicted = mydf$val, groups = 10)
print(gains.cross)

##############################################################################
# Scoring the Test Data using the model we just created

# NOTE(review): test_data has not received the gpa capping or gre imputation
# that this script applies to train_data; rows with missing predictors will
# presumably get an NA prediction -- confirm whether the same treatment
# should be applied before scoring.
pred <- predict(mylogit, test_data, type = "response")
final <- cbind(test_data, pred)

# Written relative to the current working directory
write.csv(final, "final_probs.csv")

##############################################################################
#REFERENCE MATERIAL

## http://www.ats.ucla.edu/stat/r/dae/logit.htm
## http://www.unc.edu/courses/2010fall/ecol/563/001/notes/lecture21%20Rcode.html
## Caret Package: http://topepo.github.io/caret/
## http://www.r-bloggers.com/gini-index-and-lorenz-curve-with-r/

2 thoughts on “End to end Logistic Regression in R”

José de França says:

November 25, 2018 at 11:44 pm

The Information Value part is not clear to me.

I installed the plyr, rpart and sqldf packages, but R returned this message:

iv.mult(data,y=”admit”,vars=c(“gre”,”gpa”,”prestige”),summary=”TRUE”)
Error in iv.mult(data, y = “admit”, vars = c(“gre”, “gpa”, “prestige”), :
could not find function “iv.mult”

Please, could you suggest links about Information Value and R?
Thank you.

LikeLike

Rose says:

November 6, 2022 at 2:17 pm

Loved reading this, thanks

LikeLike

Ujjwal Karn's blog

deep learning, computer vision, nlp

End to end Logistic Regression in R

2 thoughts on “End to end Logistic Regression in R”

Leave a comment Cancel reply

Ujjwal Karn's blog

deep learning, computer vision, nlp

Share this:

Related

2 thoughts on “End to end Logistic Regression in R”

Leave a comment Cancel reply