library(randomForest)
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
library(ggplot2)
## Warning in register(): Can't find generic `scale_type` in package ggplot2 to
## register S3 method.
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
library(class)
red_wine <- read.csv("winequality-red.csv",sep = ";")
white_wine <- read.csv("winequality-white.csv",sep = ";")
bank <- read.csv("bank.csv", sep = ";")
print("Summary of red_wine and white_wine")
## [1] "Summary of red_wine and white_wine"
summary(red_wine)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 4.60 Min. :0.1200 Min. :0.000 Min. : 0.900
## 1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090 1st Qu.: 1.900
## Median : 7.90 Median :0.5200 Median :0.260 Median : 2.200
## Mean : 8.32 Mean :0.5278 Mean :0.271 Mean : 2.539
## 3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420 3rd Qu.: 2.600
## Max. :15.90 Max. :1.5800 Max. :1.000 Max. :15.500
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## Min. :0.01200 Min. : 1.00 Min. : 6.00 Min. :0.9901
## 1st Qu.:0.07000 1st Qu.: 7.00 1st Qu.: 22.00 1st Qu.:0.9956
## Median :0.07900 Median :14.00 Median : 38.00 Median :0.9968
## Mean :0.08747 Mean :15.87 Mean : 46.47 Mean :0.9967
## 3rd Qu.:0.09000 3rd Qu.:21.00 3rd Qu.: 62.00 3rd Qu.:0.9978
## Max. :0.61100 Max. :72.00 Max. :289.00 Max. :1.0037
## pH sulphates alcohol quality
## Min. :2.740 Min. :0.3300 Min. : 8.40 Min. :3.000
## 1st Qu.:3.210 1st Qu.:0.5500 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.310 Median :0.6200 Median :10.20 Median :6.000
## Mean :3.311 Mean :0.6581 Mean :10.42 Mean :5.636
## 3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.10 3rd Qu.:6.000
## Max. :4.010 Max. :2.0000 Max. :14.90 Max. :8.000
summary(white_wine)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 3.800 Min. :0.0800 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.300 1st Qu.:0.2100 1st Qu.:0.2700 1st Qu.: 1.700
## Median : 6.800 Median :0.2600 Median :0.3200 Median : 5.200
## Mean : 6.855 Mean :0.2782 Mean :0.3342 Mean : 6.391
## 3rd Qu.: 7.300 3rd Qu.:0.3200 3rd Qu.:0.3900 3rd Qu.: 9.900
## Max. :14.200 Max. :1.1000 Max. :1.6600 Max. :65.800
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## Min. :0.00900 Min. : 2.00 Min. : 9.0 Min. :0.9871
## 1st Qu.:0.03600 1st Qu.: 23.00 1st Qu.:108.0 1st Qu.:0.9917
## Median :0.04300 Median : 34.00 Median :134.0 Median :0.9937
## Mean :0.04577 Mean : 35.31 Mean :138.4 Mean :0.9940
## 3rd Qu.:0.05000 3rd Qu.: 46.00 3rd Qu.:167.0 3rd Qu.:0.9961
## Max. :0.34600 Max. :289.00 Max. :440.0 Max. :1.0390
## pH sulphates alcohol quality
## Min. :2.720 Min. :0.2200 Min. : 8.00 Min. :3.000
## 1st Qu.:3.090 1st Qu.:0.4100 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.180 Median :0.4700 Median :10.40 Median :6.000
## Mean :3.188 Mean :0.4898 Mean :10.51 Mean :5.878
## 3rd Qu.:3.280 3rd Qu.:0.5500 3rd Qu.:11.40 3rd Qu.:6.000
## Max. :3.820 Max. :1.0800 Max. :14.20 Max. :9.000
print("Summary of bank")
## [1] "Summary of bank"
summary(bank)
## age job marital education
## Min. :19.00 Length:4521 Length:4521 Length:4521
## 1st Qu.:33.00 Class :character Class :character Class :character
## Median :39.00 Mode :character Mode :character Mode :character
## Mean :41.17
## 3rd Qu.:49.00
## Max. :87.00
## default balance housing loan
## Length:4521 Min. :-3313 Length:4521 Length:4521
## Class :character 1st Qu.: 69 Class :character Class :character
## Mode :character Median : 444 Mode :character Mode :character
## Mean : 1423
## 3rd Qu.: 1480
## Max. :71188
## contact day month duration
## Length:4521 Min. : 1.00 Length:4521 Min. : 4
## Class :character 1st Qu.: 9.00 Class :character 1st Qu.: 104
## Mode :character Median :16.00 Mode :character Median : 185
## Mean :15.92 Mean : 264
## 3rd Qu.:21.00 3rd Qu.: 329
## Max. :31.00 Max. :3025
## campaign pdays previous poutcome
## Min. : 1.000 Min. : -1.00 Min. : 0.0000 Length:4521
## 1st Qu.: 1.000 1st Qu.: -1.00 1st Qu.: 0.0000 Class :character
## Median : 2.000 Median : -1.00 Median : 0.0000 Mode :character
## Mean : 2.794 Mean : 39.77 Mean : 0.5426
## 3rd Qu.: 3.000 3rd Qu.: -1.00 3rd Qu.: 0.0000
## Max. :50.000 Max. :871.00 Max. :25.0000
## y
## Length:4521
## Class :character
## Mode :character
##
##
##
The columns job, martial, education, default, housing, loan, contact, month, poutcome, and y need to be converted from characters.
#Bank Dataset
#Fixing Bank Data
#Fixing Martial + Others to Numeric
bank_numeric <- bank %>% mutate(
marital = case_when(marital == "married" ~ 1,
marital == "single" ~ 0,
marital == "divorced" ~ -1),
education = case_when(education == "primary" ~ 1,
education == "secondary" ~ 2,
education == "tertiary" ~ 3,
education == "unknown" ~ NA_real_),
default = case_when(default == "yes" ~ 1,
default == "no" ~ 0),
housing = case_when(housing == "yes" ~ 1,
housing == "no" ~ 0),
loan = case_when(loan == "yes" ~ 1,
loan == "no" ~ 0),
contact = case_when(contact == "cellular" ~ 1,
contact == "telephone" ~ 2,
contact == "unknown" ~ NA_real_),
poutcome = case_when(poutcome == "success" ~ 1,
poutcome == "other" ~ 0,
poutcome == "failure" ~ -1,
poutcome == "unknown" ~ 0),
y = case_when(y == "yes" ~ 1,
y == "no" ~ 0)
)
#Fixing Months from abb to numbers
months <- str_to_title(bank$month)
bank_numeric$month <- match(months, month.abb)
#Fixing Job Column
bank_numeric$job <- as.factor(bank$job)
bank_numeric$job <- unclass(bank_numeric$job)
summary(bank_numeric)
## age job marital education
## Min. :19.00 Min. : 1.000 Min. :-1.0000 Min. :1.000
## 1st Qu.:33.00 1st Qu.: 2.000 1st Qu.: 0.0000 1st Qu.:2.000
## Median :39.00 Median : 5.000 Median : 1.0000 Median :2.000
## Mean :41.17 Mean : 5.411 Mean : 0.5019 Mean :2.155
## 3rd Qu.:49.00 3rd Qu.: 8.000 3rd Qu.: 1.0000 3rd Qu.:3.000
## Max. :87.00 Max. :12.000 Max. : 1.0000 Max. :3.000
## NA's :187
## default balance housing loan
## Min. :0.00000 Min. :-3313 Min. :0.000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.: 69 1st Qu.:0.000 1st Qu.:0.0000
## Median :0.00000 Median : 444 Median :1.000 Median :0.0000
## Mean :0.01681 Mean : 1423 Mean :0.566 Mean :0.1528
## 3rd Qu.:0.00000 3rd Qu.: 1480 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :1.00000 Max. :71188 Max. :1.000 Max. :1.0000
##
## contact day month duration
## Min. :1.000 Min. : 1.00 Min. : 1.000 Min. : 4
## 1st Qu.:1.000 1st Qu.: 9.00 1st Qu.: 5.000 1st Qu.: 104
## Median :1.000 Median :16.00 Median : 6.000 Median : 185
## Mean :1.094 Mean :15.92 Mean : 6.167 Mean : 264
## 3rd Qu.:1.000 3rd Qu.:21.00 3rd Qu.: 8.000 3rd Qu.: 329
## Max. :2.000 Max. :31.00 Max. :12.000 Max. :3025
## NA's :1324
## campaign pdays previous poutcome
## Min. : 1.000 Min. : -1.00 Min. : 0.0000 Min. :-1.00000
## 1st Qu.: 1.000 1st Qu.: -1.00 1st Qu.: 0.0000 1st Qu.: 0.00000
## Median : 2.000 Median : -1.00 Median : 0.0000 Median : 0.00000
## Mean : 2.794 Mean : 39.77 Mean : 0.5426 Mean :-0.07985
## 3rd Qu.: 3.000 3rd Qu.: -1.00 3rd Qu.: 0.0000 3rd Qu.: 0.00000
## Max. :50.000 Max. :871.00 Max. :25.0000 Max. : 1.00000
##
## y
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.1152
## 3rd Qu.:0.0000
## Max. :1.0000
##
hist(bank_numeric$duration, breaks = c(0,10,60,120,300,600,1200,3025))
bank_numeric_lessthan10min <- bank_numeric %>% subset(duration <= 600)
bank_numeric_morethan10min <- bank_numeric %>% subset(duration > 600)
hist(bank_numeric_lessthan10min$duration)
hist(bank_numeric_morethan10min$duration)
boxplot(bank_numeric_lessthan10min$duration)
bank_numeric$y <- as.factor(bank_numeric$y)
x <- bank_numeric %>% subset(select = -c(contact,education,y))
y <- bank_numeric %>% subset(select = c(y))
x_train <- sample_n(x, 0.7*4521)
y_train <- sample_n(y, 0.7*4521)
x_test <- sample_n(x, 0.3*4521)
y_test <- sample_n(y, 0.3*4521)
train <- cbind(x_train, y_train)
test <- cbind(x_test, y_test)
bank_knn <- knn(train = scale(x_train),
test = scale(x_test),
cl = y_train$y,
k = 9)
table(y_test$y, bank_knn)
## bank_knn
## 0 1
## 0 1212 1
## 1 143 0
error <- mean(y_test$y != bank_knn)
print(paste("Accuracy = ", 1-error))
## [1] "Accuracy = 0.893805309734513"
x = rbind(as.matrix(x_train))
y = as.matrix(y_train)
svp <- ksvm(x,y,type="C-svc")
svp_pred <- predict(svp, as.matrix(x_test))
table(svp_pred, as.matrix(y_test))
##
## svp_pred 0 1
## 0 1212 143
## 1 1 0
agreement <- svp_pred == as.matrix(y_test)
prop.table(table(agreement))
## agreement
## FALSE TRUE
## 0.1061947 0.8938053
#plot(svp, data = x)
bank.rf <- randomForest(y~., data = train,
importance = TRUE,
proximity = TRUE)
print(bank.rf)
##
## Call:
## randomForest(formula = y ~ ., data = train, importance = TRUE, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 11.13%
## Confusion matrix:
## 0 1 class.error
## 0 2812 1 0.0003554924
## 1 351 0 1.0000000000
plot(bank.rf)
pred <- predict(bank.rf, x_test)
table(pred, test$y)
##
## pred 0 1
## 0 1122 130
## 1 91 13
agreement <- pred == test$y
prop.table(table(agreement))
## agreement
## FALSE TRUE
## 0.1629794 0.8370206
importance = importance(bank.rf)
var_importance = data.frame(Variables = row.names(importance),
Importance =round(importance[, 'MeanDecreaseAccuracy'],2))
rank_importance=var_importance %>%
mutate(Rank=paste('#',dense_rank(desc(Importance))))
ggplot(rank_importance,aes(x=reorder(Variables,Importance),
y=Importance,fill=Importance))+
geom_bar(stat='identity') +
geom_text(aes(x = Variables, y = 0.5, label = Rank),
hjust=0, vjust=0.55, size = 4, colour = 'white') +
labs(x = 'Variables') +
coord_flip()
importance = importance(bank.rf)
var_importance = data.frame(Variables = row.names(importance),
Importance =round(importance[, 'MeanDecreaseGini'],2))
rank_importance=var_importance %>%
mutate(Rank=paste('#',dense_rank(desc(Importance))))
ggplot(rank_importance,aes(x=reorder(Variables,Importance),
y=Importance,fill=Importance))+
geom_bar(stat='identity') +
geom_text(aes(x = Variables, y = 0.5, label = Rank),
hjust=0, vjust=0.55, size = 4, colour = 'white') +
labs(x = 'Variables') +
coord_flip()
#Wine Dataset
hist(as.numeric(white_wine$quality))
white_wine$quality <- as.factor(white_wine$quality)
x <- white_wine %>% subset(select = -c(quality))
y <- white_wine %>% subset(select = c(quality))
y <- y %>% mutate(
quality = case_when(quality == 3 | quality == 4 ~ -1,
quality == 5 | quality == 6 | quality == 7 ~ 0,
quality == 8 | quality == 9 ~ 1)
)
y$quality <- as.factor(y$quality)
summary(y)
## quality
## -1: 183
## 0 :4535
## 1 : 180
x_train <- sample_n(x, 0.7*4898)
y_train <- sample_n(y, 0.7*4898)
x_test <- sample_n(x, 0.3*4898)
y_test <- sample_n(y, 0.3*4898)
train <- cbind(x_train, y_train)
test <- cbind(x_test, y_test)
white_wine_knn <- knn(train = scale(x_train),
test = scale(x_test),
cl = y_train$quality,
k = 7)
table(y_test$quality, white_wine_knn)
## white_wine_knn
## -1 0 1
## -1 0 53 0
## 0 0 1358 0
## 1 0 58 0
error <- mean(y_test$quality != white_wine_knn)
print(paste("Accuracy = ", 1-error))
## [1] "Accuracy = 0.924438393464942"
x = rbind(as.matrix(x_train))
y = as.matrix(y_train)
svp <- ksvm(x,y,type="C-svc")
svp_pred <- predict(svp, as.matrix(x_test))
table(svp_pred, as.matrix(y_test))
##
## svp_pred -1 0 1
## -1 0 0 0
## 0 53 1358 58
## 1 0 0 0
agreement <- svp_pred == as.matrix(y_test)
prop.table(table(agreement))
## agreement
## FALSE TRUE
## 0.07556161 0.92443839
#plot(svp, data = x)
white_wine.rf <- randomForest(quality~., data = train,
ntree = 40,
importance = TRUE,
proximity = TRUE)
print(white_wine.rf)
##
## Call:
## randomForest(formula = quality ~ ., data = train, ntree = 40, importance = TRUE, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 40
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 8.93%
## Confusion matrix:
## -1 0 1 class.error
## -1 2 124 0 0.98412698
## 0 25 3120 30 0.01732283
## 1 0 127 0 1.00000000
plot(white_wine.rf)
pred <- predict(white_wine.rf, x_test)
table(pred, test$quality)
##
## pred -1 0 1
## -1 1 36 1
## 0 52 1294 56
## 1 0 28 1
agreement <- pred == test$quality
prop.table(table(agreement))
## agreement
## FALSE TRUE
## 0.1177672 0.8822328
importance = importance(white_wine.rf)
var_importance = data.frame(Variables = row.names(importance),
Importance =round(importance[, 'MeanDecreaseAccuracy'],2))
rank_importance=var_importance %>%
mutate(Rank=paste('#',dense_rank(desc(Importance))))
ggplot(rank_importance,aes(x=reorder(Variables,Importance),
y=Importance,fill=Importance))+
geom_bar(stat='identity') +
geom_text(aes(x = Variables, y = 0.5, label = Rank),
hjust=0, vjust=0.55, size = 4, colour = 'white') +
labs(x = 'Variables') +
coord_flip()
importance = importance(white_wine.rf)
var_importance = data.frame(Variables = row.names(importance),
Importance =round(importance[, 'MeanDecreaseGini'],2))
rank_importance=var_importance %>%
mutate(Rank=paste('#',dense_rank(desc(Importance))))
ggplot(rank_importance,aes(x=reorder(Variables,Importance),
y=Importance,fill=Importance))+
geom_bar(stat='identity') +
geom_text(aes(x = Variables, y = 0.5, label = Rank),
hjust=0, vjust=0.55, size = 4, colour = 'white') +
labs(x = 'Variables') +
coord_flip()
red_wine$quality <- as.factor(red_wine$quality)
x <- red_wine %>% subset(select = -c(quality))
y <- red_wine %>% subset(select = c(quality))
y <- y %>% mutate(
quality = case_when(quality == 3 | quality == 4 ~ -1,
quality == 5 | quality == 6 | quality == 7 ~ 0,
quality == 8 | quality == 9 ~ 1)
)
y$quality <- as.factor(y$quality)
summary(y)
## quality
## -1: 63
## 0 :1518
## 1 : 18
x_train <- sample_n(x, 0.7*1599)
y_train <- sample_n(y, 0.7*1599)
x_test <- sample_n(x, 0.3*1599)
y_test <- sample_n(y, 0.3*1599)
red_wine_knn <- knn(train = scale(x_train),
test = scale(x_test),
cl = y_train$quality,
k = 7)
table(y_test$quality, red_wine_knn)
## red_wine_knn
## -1 0 1
## -1 0 22 0
## 0 0 454 0
## 1 0 3 0
error <- mean(y_test$quality != red_wine_knn)
print(paste("Accuracy = ", 1-error))
## [1] "Accuracy = 0.947807933194154"
x = rbind(as.matrix(x_train))
y = as.matrix(y_train)
svp <- ksvm(x,y,type="C-svc")
svp_pred <- predict(svp, as.matrix(x_test))
table(svp_pred, as.matrix(y_test))
##
## svp_pred -1 0 1
## -1 0 0 0
## 0 22 454 3
## 1 0 0 0
agreement <- svp_pred == as.matrix(y_test)
prop.table(table(agreement))
## agreement
## FALSE TRUE
## 0.05219207 0.94780793
#plot(svp, data = x)
train <- cbind(x_train, y_train)
test <- cbind(x_test, y_test)
red_wine.rf <- randomForest(quality~., data = train,
ntree = 30,
importance = TRUE,
proximity = TRUE)
print(red_wine.rf)
##
## Call:
## randomForest(formula = quality ~ ., data = train, ntree = 30, importance = TRUE, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 30
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 6.79%
## Confusion matrix:
## -1 0 1 class.error
## -1 0 48 0 1.00000000
## 0 14 1043 2 0.01510859
## 1 0 12 0 1.00000000
plot(red_wine.rf)
pred <- predict(red_wine.rf, x_test)
table(pred, test$quality)
##
## pred -1 0 1
## -1 0 8 0
## 0 22 446 3
## 1 0 0 0
agreement <- pred == test$quality
prop.table(table(agreement))
## agreement
## FALSE TRUE
## 0.06889353 0.93110647
importance = importance(red_wine.rf)
var_importance = data.frame(Variables = row.names(importance),
Importance =round(importance[, 'MeanDecreaseAccuracy'],2))
rank_importance=var_importance %>%
mutate(Rank=paste('#',dense_rank(desc(Importance))))
ggplot(rank_importance,aes(x=reorder(Variables,Importance),
y=Importance,fill=Importance))+
geom_bar(stat='identity') +
geom_text(aes(x = Variables, y = 0.5, label = Rank),
hjust=0, vjust=0.55, size = 4, colour = 'white') +
labs(x = 'Variables') +
coord_flip()
importance = importance(red_wine.rf)
var_importance = data.frame(Variables = row.names(importance),
Importance =round(importance[, 'MeanDecreaseGini'],2))
rank_importance=var_importance %>%
mutate(Rank=paste('#',dense_rank(desc(Importance))))
ggplot(rank_importance,aes(x=reorder(Variables,Importance),
y=Importance,fill=Importance))+
geom_bar(stat='identity') +
geom_text(aes(x = Variables, y = 0.5, label = Rank),
hjust=0, vjust=0.55, size = 4, colour = 'white') +
labs(x = 'Variables') +
coord_flip()