Predictive Modeling

Predictive modeling is also known as machine learning. It has two common types: supervised and unsupervised learning.

# load libraries
library(ggplot2)
library(dplyr)

# Read the credit data set (CSV file expected in the working directory)
credit <- read.csv(file = "credit.csv")

Data Cleaning / Manipulation

# Working copy of the data without the ID column
new_credit <- credit %>%
  select(-ID)
# Preview the first rows of the cleaned data
head(new_credit)

Split the data

# Simple linear regression of Balance on Rating with a 75/25 train/test split.

# Fix the RNG seed so the random split (and therefore the fitted model) can be
# reproduced from run to run.
set.seed(123)

# seq_len() is safe for an empty data frame, unlike 1:nrow(); floor() makes the
# integer sample size explicit.
train_index <- sample(seq_len(nrow(new_credit)), floor(0.75 * nrow(new_credit)))
test_index <- setdiff(seq_len(nrow(new_credit)), train_index)

train_df <- new_credit[train_index, ]
# Remove the response by name rather than by column position, so the test
# predictors stay correct if the column order ever changes.
test_df <- new_credit[
  test_index,
  setdiff(names(new_credit), "Balance"),
  drop = FALSE
]
# Observed Balance of the held-out rows, kept for comparison with predictions
test_req <- as.data.frame(new_credit[test_index, "Balance"])
train_df

# Fit Balance as a linear function of Rating on the training rows
modfit <- lm(Balance ~ Rating, data = train_df)
summary(modfit)
## 
## Call:
## lm(formula = Balance ~ Rating, data = train_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -698.50 -137.14   -8.25  139.05  790.09 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -383.44503   33.45595  -11.46   <2e-16 ***
## Rating         2.53293    0.08433   30.04   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 234.3 on 298 degrees of freedom
## Multiple R-squared:  0.7517, Adjusted R-squared:  0.7509 
## F-statistic: 902.2 on 1 and 298 DF,  p-value: < 2.2e-16
# Predict Balance for the held-out rows and store it next to the observed values
test_req$Prediction <- predict(modfit, newdata = test_df)
# Give both columns readable names before printing
# (first column: observed Balance, second column: model prediction)
names(test_req) <- c("Expected", "Predicted")
test_req

# Training data with the fitted regression line. Balance is the response of
# `modfit`, so it belongs on the y-axis; with this mapping
# geom_smooth(method = "lm") draws the Balance ~ Rating fit instead of the
# inverse regression.
ggplot(data = train_df, aes(x = Rating, y = Balance)) +
  geom_point() +
  geom_smooth(method = "lm")

# Observed versus predicted Balance on the test rows
ggplot(data = test_req, aes(x = Expected, y = Predicted)) +
  geom_point()

Multiple Linear Regression

Feature Engineering

# Show the data before the indicator columns are added
new_credit

# One 0/1 indicator column per Ethnicity value.
# NOTE(review): the full-model summary later in this file reports
# African_American as "not defined because of singularities", which suggests
# the three indicators are collinear with the intercept -- consider keeping
# only two of them.
new_credit[["Caucasian"]] <- as.numeric(credit[["Ethnicity"]] == "Caucasian")
new_credit[["Asian"]] <- as.numeric(credit[["Ethnicity"]] == "Asian")
new_credit[["African_American"]] <- as.numeric(
  credit[["Ethnicity"]] == "African American"
)
head(new_credit)

# The original categorical column is no longer needed
new_credit <- new_credit %>%
  select(-Ethnicity)
head(new_credit)

# Store the number of cards as an integer
new_credit[["Cards"]] <- as.integer(new_credit[["Cards"]])

Split Data

# Reproducible 75/25 train/test split for the multiple regression
set.seed(123)
# seq_len() is safe for an empty data frame, unlike 1:nrow(); floor() makes the
# integer sample size explicit.
train_index <- sample(seq_len(nrow(new_credit)), floor(0.75 * nrow(new_credit)))
test_index <- setdiff(seq_len(nrow(new_credit)), train_index)

train_df <- new_credit[train_index, ]
# Remove the response by name rather than by column position, so the test
# predictors stay correct if columns are added, removed or reordered.
test_df <- new_credit[
  test_index,
  setdiff(names(new_credit), "Balance"),
  drop = FALSE
]
# Observed Balance of the held-out rows, kept for comparison with predictions
test_req <- as.data.frame(new_credit[test_index, "Balance"])

# Full model: Balance regressed on every other column of the training data.
# All three ethnicity indicators enter the model here; the summary reports one
# of them as NA because of singularities.
model_fit1 <- lm(Balance ~ ., train_df)
summary(model_fit1)
## 
## Call:
## lm(formula = Balance ~ ., data = train_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -181.66  -80.68  -13.03   54.43  285.34 
## 
## Coefficients: (1 not defined because of singularities)
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -475.77857   41.99065 -11.331  < 2e-16 ***
## Income             -7.68751    0.27376 -28.081  < 2e-16 ***
## Limit               0.22191    0.03919   5.663 3.60e-08 ***
## Rating              0.63618    0.58721   1.083    0.280    
## Cards              21.97988    5.24367   4.192 3.69e-05 ***
## Age                -0.65930    0.34223  -1.927    0.055 .  
## Education          -1.00014    1.84846  -0.541    0.589    
## GenderFemale       -9.77202   11.61450  -0.841    0.401    
## StudentYes        439.73026   19.69464  22.327  < 2e-16 ***
## MarriedYes          8.72569   12.19082   0.716    0.475    
## Caucasian           5.21212   14.31662   0.364    0.716    
## Asian              23.56744   16.37304   1.439    0.151    
## African_American         NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 99.81 on 288 degrees of freedom
## Multiple R-squared:  0.9561, Adjusted R-squared:  0.9544 
## F-statistic: 570.3 on 11 and 288 DF,  p-value: < 2.2e-16
# Reduced model using only Income, Limit, Cards, Age and Student as predictors
model.fit2 <- lm(
  formula = Balance ~ Income + Limit + Cards + Age + Student,
  data = train_df
)
summary(model.fit2)
## 
## Call:
## lm(formula = Balance ~ Income + Limit + Cards + Age + Student, 
##     data = train_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -190.17  -76.67  -14.40   61.38  298.42 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.632e+02  2.555e+01 -18.126  < 2e-16 ***
## Income      -7.656e+00  2.724e-01 -28.107  < 2e-16 ***
## Limit        2.641e-01  4.165e-03  63.396  < 2e-16 ***
## Cards        2.566e+01  4.183e+00   6.134 2.76e-09 ***
## Age         -7.199e-01  3.402e-01  -2.116   0.0352 *  
## StudentYes   4.391e+02  1.926e+01  22.800  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 99.71 on 294 degrees of freedom
## Multiple R-squared:  0.9553, Adjusted R-squared:  0.9545 
## F-statistic:  1256 on 5 and 294 DF,  p-value: < 2.2e-16
# Predicted Balance for the held-out rows from the reduced model
predict(object = model.fit2, newdata = test_df)
##          2          4          5          6         12         13 
##  936.97274  957.50011  404.52754 1095.17076 -201.26351  305.96770 
##         14         15         16         19         22         23 
## 1020.69325  256.64307   85.74363  848.66325  908.97216  110.74859 
##         24         30         32         35         37         38 
##  508.00748  876.28903  134.29390  114.60861  764.32087 1020.18803 
##         42         45         54         60         63         64 
## 1132.15957 1035.33365  910.53349  582.01815 -260.92425  268.95338 
##         74         76         77         87         91        101 
##  627.87532  154.95785  587.82944  789.06481  977.53841  357.59382 
##        105        111        112        117        118        135 
##  739.07592  172.13211  107.56862 -141.10582 1235.69206  833.84274 
##        138        142        144        147        153        158 
##  290.02040  644.54623  710.28699  793.03649  304.05883 1002.71319 
##        159        169        171        186        188        189 
##  669.71138   73.82194 -159.75682  481.00203  -69.31945  910.94931 
##        193        205        218        219        223        226 
##  419.48018  519.00723  896.30671  329.73924 1430.03077 1001.92963 
##        228        229        234        235        237        239 
##  549.60937  266.84038   75.40573 1228.01996  552.57209  214.33566 
##        241        242        251        253        260        261 
##  271.48503 -317.97704  116.65800 1281.08525 1029.68880  442.93954 
##        265        275        282        288        293        295 
##  657.14746  922.75028 -197.59635  -54.90795  517.01521  -50.70160 
##        297        300        301        303        306        311 
##  380.05407  702.13478  582.87736  381.98494 -127.04399  997.81600 
##        313        319        321        323        325        328 
##  812.69207 -198.17572  178.48386  365.61600  498.25898  947.68647 
##        330        335        337        342        343        348 
##  835.95655  612.49020  892.75912  467.07771  307.06534 1150.03056 
##        355        368        370        376        377        389 
##  461.16384  310.21049 1143.95212  970.93809  776.79386 1043.38053 
##        390        394        398        399 
##  788.69069 -238.68826  275.26615  -91.24959
# Standardize the numeric predictors and refit the model on the scaled data.

# Center and scale the numeric training predictors; the categorical columns,
# the ethnicity indicators and the response are excluded from scaling.
train_scaled_matrix <- train_df %>%
  select(
    -Gender, -Student, -Married,
    -Caucasian, -Asian, -African_American,
    -Balance
  ) %>%
  scale(center = TRUE, scale = TRUE)

# scale() stores the column means and standard deviations it used as
# attributes; keep them so the test set can be transformed identically.
train_center <- attr(train_scaled_matrix, "scaled:center")
train_scale <- attr(train_scaled_matrix, "scaled:scale")

scaled_train <- as.data.frame(train_scaled_matrix)
scaled_train$Student <- train_df$Student
scaled_train$Balance <- train_df$Balance

# Scale the test predictors with the TRAINING means and standard deviations.
# Scaling the test set with its own statistics would put it on a different
# scale from the one the model was fitted on and distort the predictions.
# Selecting by the training column names also keeps the two sets aligned.
scaled_test <- test_df[, names(train_center), drop = FALSE] %>%
  scale(center = train_center, scale = train_scale) %>%
  as.data.frame()
scaled_test$Student <- test_df$Student
test_df

# Balance regressed on all scaled predictors plus Student
model_fit_scaled <- lm(Balance ~ ., scaled_train)
summary(model_fit_scaled)
## 
## Call:
## lm(formula = Balance ~ ., data = scaled_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -177.09  -77.16  -11.53   59.19  299.16 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  473.273      6.077  77.882  < 2e-16 ***
## Income      -285.456     10.157 -28.104  < 2e-16 ***
## Limit        519.809     92.057   5.647 3.88e-08 ***
## Rating       111.782     92.580   1.207   0.2282    
## Cards         30.138      7.193   4.190 3.70e-05 ***
## Age          -12.235      5.982  -2.046   0.0417 *  
## Education     -2.759      5.797  -0.476   0.6345    
## StudentYes   437.969     19.385  22.593  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 99.75 on 292 degrees of freedom
## Multiple R-squared:  0.9555, Adjusted R-squared:  0.9545 
## F-statistic: 896.6 on 7 and 292 DF,  p-value: < 2.2e-16
# Refit on the scaled data with Income, Limit, Cards, Age and Student only;
# this overwrites the previous `model_fit_scaled`
model_fit_scaled <- lm(
  formula = Balance ~ Income + Limit + Cards + Age + Student,
  data = scaled_train
)
summary(model_fit_scaled)
## 
## Call:
## lm(formula = Balance ~ Income + Limit + Cards + Age + Student, 
##     data = scaled_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -190.17  -76.67  -14.40   61.38  298.42 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  473.155      6.070  77.944  < 2e-16 ***
## Income      -284.425     10.119 -28.107  < 2e-16 ***
## Limit        630.376      9.943  63.396  < 2e-16 ***
## Cards         35.421      5.774   6.134 2.76e-09 ***
## Age          -12.634      5.971  -2.116   0.0352 *  
## StudentYes   439.148     19.261  22.800  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 99.71 on 294 degrees of freedom
## Multiple R-squared:  0.9553, Adjusted R-squared:  0.9545 
## F-statistic:  1256 on 5 and 294 DF,  p-value: < 2.2e-16
# Attach the scaled-model predictions to the observed test values
test_req[["predict"]] <- predict(model_fit_scaled, newdata = scaled_test)
# First column holds the observed Balance, second the prediction
colnames(test_req) <- c("Expected", "Predicted")
test_req