It’s also known as Machine Learning. It has two common types: Supervised and Unsupervised Learning.
# load libraries
library(ggplot2)
library(dplyr)
# Load the credit data and drop the ID column so it is not used as a predictor.
credit <- read.csv("credit.csv")
new_credit <- select(credit, -ID)
head(new_credit)

# Fix the RNG seed so the 75/25 train/test split, and every number derived
# from it, is reproducible between runs. The summary output pasted below this
# block came from an unseeded run, so its exact figures will differ.
set.seed(123)
# seq_len() is safe even when the data frame has zero rows (1:0 is c(1, 0)).
train_index <- sample(seq_len(nrow(new_credit)), 0.75 * nrow(new_credit))
test_index <- setdiff(seq_len(nrow(new_credit)), train_index)
train_df <- new_credit[train_index, ]
# Drop the response by name rather than by position (previously -11), so the
# split keeps working if columns are added, removed or reordered.
# NOTE(review): assumes column 11 of new_credit was Balance -- confirm.
test_df <- new_credit[test_index, names(new_credit) != "Balance"]
# Observed Balance values for the held-out rows, kept for later comparison
# against predictions.
test_req <- as.data.frame(new_credit[test_index, "Balance"])
train_df

# Simple linear regression of Balance on Rating, fitted on the training rows.
modfit <- lm(Balance ~ Rating, data = train_df)
summary(modfit)
##
## Call:
## lm(formula = Balance ~ Rating, data = train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -698.50 -137.14 -8.25 139.05 790.09
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -383.44503 33.45595 -11.46 <2e-16 ***
## Rating 2.53293 0.08433 30.04 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 234.3 on 298 degrees of freedom
## Multiple R-squared: 0.7517, Adjusted R-squared: 0.7509
## F-statistic: 902.2 on 1 and 298 DF, p-value: < 2.2e-16
# Score the held-out rows with the simple model and store the predictions next
# to the observed Balance values.
test_req$Prediction <- predict(modfit, newdata = test_df)
test_req
names(test_req) <- c("Expected", "Predicted")

# modfit is Balance ~ Rating, so Rating (predictor) goes on x and Balance
# (response) on y. With the axes the other way round geom_smooth() would draw
# the regression of Rating on Balance, which is not the fitted model.
ggplot(data = train_df, aes(x = Rating, y = Balance)) +
  geom_point() +
  geom_smooth(method = "lm")

# Observed vs. predicted Balance on the test rows.
ggplot(data = test_req, aes(x = Expected, y = Predicted)) +
  geom_point()
# Show the data before encoding.
new_credit

# Encode Ethnicity as three 1/0 indicator columns, one per level. The levels
# are read from the original `credit` frame.
new_credit <- mutate(
  new_credit,
  Caucasian = as.numeric(credit$Ethnicity == "Caucasian"),
  Asian = as.numeric(credit$Ethnicity == "Asian"),
  African_American = as.numeric(credit$Ethnicity == "African American")
)
head(new_credit)

# The raw Ethnicity column is no longer needed once the indicators exist.
new_credit <- new_credit %>% select(-Ethnicity)
head(new_credit)

# Store the card count as an integer.
new_credit[["Cards"]] <- as.integer(new_credit[["Cards"]])
# Re-split the encoded data with a fixed seed so the results are reproducible.
set.seed(123)
# seq_len() is safe even when the data frame has zero rows (1:0 is c(1, 0)).
train_index <- sample(seq_len(nrow(new_credit)), 0.75 * nrow(new_credit))
test_index <- setdiff(seq_len(nrow(new_credit)), train_index)
train_df <- new_credit[train_index, ]
# Drop the response by name rather than by position (previously -10), so the
# split does not silently remove the wrong column if the layout changes.
# NOTE(review): assumes column 10 of new_credit was Balance -- confirm.
test_df <- new_credit[test_index, names(new_credit) != "Balance"]
# Observed Balance values for the held-out rows.
test_req <- as.data.frame(new_credit[test_index, "Balance"])

# Fit Balance on every predictor. The three ethnicity indicators always sum to
# 1, so together with the intercept they are perfectly collinear and lm()
# reports one of them as NA ("not defined because of singularities").
# African_American is left out of the formula so it acts as the reference
# level; the column itself stays in the data for later steps.
model_fit1 <- lm(Balance ~ . - African_American, data = train_df)
summary(model_fit1)
##
## Call:
## lm(formula = Balance ~ ., data = train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -181.66 -80.68 -13.03 54.43 285.34
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -475.77857 41.99065 -11.331 < 2e-16 ***
## Income -7.68751 0.27376 -28.081 < 2e-16 ***
## Limit 0.22191 0.03919 5.663 3.60e-08 ***
## Rating 0.63618 0.58721 1.083 0.280
## Cards 21.97988 5.24367 4.192 3.69e-05 ***
## Age -0.65930 0.34223 -1.927 0.055 .
## Education -1.00014 1.84846 -0.541 0.589
## GenderFemale -9.77202 11.61450 -0.841 0.401
## StudentYes 439.73026 19.69464 22.327 < 2e-16 ***
## MarriedYes 8.72569 12.19082 0.716 0.475
## Caucasian 5.21212 14.31662 0.364 0.716
## Asian 23.56744 16.37304 1.439 0.151
## African_American NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 99.81 on 288 degrees of freedom
## Multiple R-squared: 0.9561, Adjusted R-squared: 0.9544
## F-statistic: 570.3 on 11 and 288 DF, p-value: < 2.2e-16
# Reduced model: Balance regressed on Income, Limit, Cards, Age and Student
# only, fitted on the training rows.
model.fit2 <- lm(
  formula = Balance ~ Income + Limit + Cards + Age + Student,
  data = train_df
)
summary(model.fit2)
##
## Call:
## lm(formula = Balance ~ Income + Limit + Cards + Age + Student,
## data = train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -190.17 -76.67 -14.40 61.38 298.42
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.632e+02 2.555e+01 -18.126 < 2e-16 ***
## Income -7.656e+00 2.724e-01 -28.107 < 2e-16 ***
## Limit 2.641e-01 4.165e-03 63.396 < 2e-16 ***
## Cards 2.566e+01 4.183e+00 6.134 2.76e-09 ***
## Age -7.199e-01 3.402e-01 -2.116 0.0352 *
## StudentYes 4.391e+02 1.926e+01 22.800 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 99.71 on 294 degrees of freedom
## Multiple R-squared: 0.9553, Adjusted R-squared: 0.9545
## F-statistic: 1256 on 5 and 294 DF, p-value: < 2.2e-16
# Print the reduced model's predictions for the held-out rows.
predict(object = model.fit2, newdata = test_df)
## 2 4 5 6 12 13
## 936.97274 957.50011 404.52754 1095.17076 -201.26351 305.96770
## 14 15 16 19 22 23
## 1020.69325 256.64307 85.74363 848.66325 908.97216 110.74859
## 24 30 32 35 37 38
## 508.00748 876.28903 134.29390 114.60861 764.32087 1020.18803
## 42 45 54 60 63 64
## 1132.15957 1035.33365 910.53349 582.01815 -260.92425 268.95338
## 74 76 77 87 91 101
## 627.87532 154.95785 587.82944 789.06481 977.53841 357.59382
## 105 111 112 117 118 135
## 739.07592 172.13211 107.56862 -141.10582 1235.69206 833.84274
## 138 142 144 147 153 158
## 290.02040 644.54623 710.28699 793.03649 304.05883 1002.71319
## 159 169 171 186 188 189
## 669.71138 73.82194 -159.75682 481.00203 -69.31945 910.94931
## 193 205 218 219 223 226
## 419.48018 519.00723 896.30671 329.73924 1430.03077 1001.92963
## 228 229 234 235 237 239
## 549.60937 266.84038 75.40573 1228.01996 552.57209 214.33566
## 241 242 251 253 260 261
## 271.48503 -317.97704 116.65800 1281.08525 1029.68880 442.93954
## 265 275 282 288 293 295
## 657.14746 922.75028 -197.59635 -54.90795 517.01521 -50.70160
## 297 300 301 303 306 311
## 380.05407 702.13478 582.87736 381.98494 -127.04399 997.81600
## 313 319 321 323 325 328
## 812.69207 -198.17572 178.48386 365.61600 498.25898 947.68647
## 330 335 337 342 343 348
## 835.95655 612.49020 892.75912 467.07771 307.06534 1150.03056
## 355 368 370 376 377 389
## 461.16384 310.21049 1143.95212 970.93809 776.79386 1043.38053
## 390 394 398 399
## 788.69069 -238.68826 275.26615 -91.24959
# Standardise the numeric predictors of the training set (mean 0, sd 1).
# Categorical columns, the ethnicity indicators and the response are excluded
# from scaling.
train_numeric <- select(
  train_df,
  -Gender, -Student, -Married, -Caucasian, -Asian, -African_American, -Balance
)
train_scaled <- scale(train_numeric, center = TRUE, scale = TRUE)
scaled_train <- as.data.frame(train_scaled)
scaled_train$Student <- train_df$Student
scaled_train$Balance <- train_df$Balance

# Scale the test set with the TRAINING means and standard deviations. Scaling
# it with its own statistics would put the test predictors on a different
# scale from the one the model was fitted on and bias the predictions.
# scale() records the parameters it used as attributes on its result.
scaled_test <- scale(
  test_df[, colnames(train_scaled)],
  center = attr(train_scaled, "scaled:center"),
  scale = attr(train_scaled, "scaled:scale")
) %>%
  as.data.frame()
scaled_test$Student <- test_df$Student
test_df
# Fit Balance on every column of the scaled training frame (the standardised
# numeric predictors plus Student).
model_fit_scaled <- lm(formula = Balance ~ ., data = scaled_train)
summary(model_fit_scaled)
##
## Call:
## lm(formula = Balance ~ ., data = scaled_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -177.09 -77.16 -11.53 59.19 299.16
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 473.273 6.077 77.882 < 2e-16 ***
## Income -285.456 10.157 -28.104 < 2e-16 ***
## Limit 519.809 92.057 5.647 3.88e-08 ***
## Rating 111.782 92.580 1.207 0.2282
## Cards 30.138 7.193 4.190 3.70e-05 ***
## Age -12.235 5.982 -2.046 0.0417 *
## Education -2.759 5.797 -0.476 0.6345
## StudentYes 437.969 19.385 22.593 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 99.75 on 292 degrees of freedom
## Multiple R-squared: 0.9555, Adjusted R-squared: 0.9545
## F-statistic: 896.6 on 7 and 292 DF, p-value: < 2.2e-16
# Refit on the scaled data with the reduced predictor set. This overwrites the
# previous model_fit_scaled object.
model_fit_scaled <- lm(
  formula = Balance ~ Income + Limit + Cards + Age + Student,
  data = scaled_train
)
summary(model_fit_scaled)
##
## Call:
## lm(formula = Balance ~ Income + Limit + Cards + Age + Student,
## data = scaled_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -190.17 -76.67 -14.40 61.38 298.42
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 473.155 6.070 77.944 < 2e-16 ***
## Income -284.425 10.119 -28.107 < 2e-16 ***
## Limit 630.376 9.943 63.396 < 2e-16 ***
## Cards 35.421 5.774 6.134 2.76e-09 ***
## Age -12.634 5.971 -2.116 0.0352 *
## StudentYes 439.148 19.261 22.800 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 99.71 on 294 degrees of freedom
## Multiple R-squared: 0.9553, Adjusted R-squared: 0.9545
## F-statistic: 1256 on 5 and 294 DF, p-value: < 2.2e-16
# Attach the scaled model's predictions for the held-out rows to the observed
# Balance values, label the two columns and display the comparison.
test_req[["predict"]] <- predict(model_fit_scaled, newdata = scaled_test)
colnames(test_req) <- c("Expected", "Predicted")
test_req