library(randomForest)
library(ggplot2)
library(dplyr)
library(stringr)
library(kernlab)
library(class)
red_wine <- read.csv("winequality-red.csv",sep = ";")
white_wine <- read.csv("winequality-white.csv",sep = ";")
bank <- read.csv("bank.csv", sep = ";")
print("Summary of red_wine and white_wine")
[1] "Summary of red_wine and white_wine"
summary(red_wine)
fixed.acidity volatile.acidity
Min. : 4.60 Min. :0.1200
1st Qu.: 7.10 1st Qu.:0.3900
Median : 7.90 Median :0.5200
Mean : 8.32 Mean :0.5278
3rd Qu.: 9.20 3rd Qu.:0.6400
Max. :15.90 Max. :1.5800
citric.acid residual.sugar
Min. :0.000 Min. : 0.900
1st Qu.:0.090 1st Qu.: 1.900
Median :0.260 Median : 2.200
Mean :0.271 Mean : 2.539
3rd Qu.:0.420 3rd Qu.: 2.600
Max. :1.000 Max. :15.500
chlorides free.sulfur.dioxide
Min. :0.01200 Min. : 1.00
1st Qu.:0.07000 1st Qu.: 7.00
Median :0.07900 Median :14.00
Mean :0.08747 Mean :15.87
3rd Qu.:0.09000 3rd Qu.:21.00
Max. :0.61100 Max. :72.00
total.sulfur.dioxide density
Min. : 6.00 Min. :0.9901
1st Qu.: 22.00 1st Qu.:0.9956
Median : 38.00 Median :0.9968
Mean : 46.47 Mean :0.9967
3rd Qu.: 62.00 3rd Qu.:0.9978
Max. :289.00 Max. :1.0037
pH sulphates
Min. :2.740 Min. :0.3300
1st Qu.:3.210 1st Qu.:0.5500
Median :3.310 Median :0.6200
Mean :3.311 Mean :0.6581
3rd Qu.:3.400 3rd Qu.:0.7300
Max. :4.010 Max. :2.0000
alcohol quality
Min. : 8.40 Min. :3.000
1st Qu.: 9.50 1st Qu.:5.000
Median :10.20 Median :6.000
Mean :10.42 Mean :5.636
3rd Qu.:11.10 3rd Qu.:6.000
Max. :14.90 Max. :8.000
summary(white_wine)
fixed.acidity volatile.acidity
Min. : 3.800 Min. :0.0800
1st Qu.: 6.300 1st Qu.:0.2100
Median : 6.800 Median :0.2600
Mean : 6.855 Mean :0.2782
3rd Qu.: 7.300 3rd Qu.:0.3200
Max. :14.200 Max. :1.1000
citric.acid residual.sugar
Min. :0.0000 Min. : 0.600
1st Qu.:0.2700 1st Qu.: 1.700
Median :0.3200 Median : 5.200
Mean :0.3342 Mean : 6.391
3rd Qu.:0.3900 3rd Qu.: 9.900
Max. :1.6600 Max. :65.800
chlorides free.sulfur.dioxide
Min. :0.00900 Min. : 2.00
1st Qu.:0.03600 1st Qu.: 23.00
Median :0.04300 Median : 34.00
Mean :0.04577 Mean : 35.31
3rd Qu.:0.05000 3rd Qu.: 46.00
Max. :0.34600 Max. :289.00
total.sulfur.dioxide density
Min. : 9.0 Min. :0.9871
1st Qu.:108.0 1st Qu.:0.9917
Median :134.0 Median :0.9937
Mean :138.4 Mean :0.9940
3rd Qu.:167.0 3rd Qu.:0.9961
Max. :440.0 Max. :1.0390
pH sulphates
Min. :2.720 Min. :0.2200
1st Qu.:3.090 1st Qu.:0.4100
Median :3.180 Median :0.4700
Mean :3.188 Mean :0.4898
3rd Qu.:3.280 3rd Qu.:0.5500
Max. :3.820 Max. :1.0800
alcohol quality
Min. : 8.00 Min. :3.000
1st Qu.: 9.50 1st Qu.:5.000
Median :10.40 Median :6.000
Mean :10.51 Mean :5.878
3rd Qu.:11.40 3rd Qu.:6.000
Max. :14.20 Max. :9.000
print("Summary of bank")
[1] "Summary of bank"
summary(bank)
age job
Min. :19.00 Length:4521
1st Qu.:33.00 Class :character
Median :39.00 Mode :character
Mean :41.17
3rd Qu.:49.00
Max. :87.00
marital education
Length:4521 Length:4521
Class :character Class :character
Mode :character Mode :character
default balance
Length:4521 Min. :-3313
Class :character 1st Qu.: 69
Mode :character Median : 444
Mean : 1423
3rd Qu.: 1480
Max. :71188
housing loan
Length:4521 Length:4521
Class :character Class :character
Mode :character Mode :character
contact day
Length:4521 Min. : 1.00
Class :character 1st Qu.: 9.00
Mode :character Median :16.00
Mean :15.92
3rd Qu.:21.00
Max. :31.00
month duration
Length:4521 Min. : 4
Class :character 1st Qu.: 104
Mode :character Median : 185
Mean : 264
3rd Qu.: 329
Max. :3025
campaign pdays
Min. : 1.000 Min. : -1.00
1st Qu.: 1.000 1st Qu.: -1.00
Median : 2.000 Median : -1.00
Mean : 2.794 Mean : 39.77
3rd Qu.: 3.000 3rd Qu.: -1.00
Max. :50.000 Max. :871.00
previous poutcome
Min. : 0.0000 Length:4521
1st Qu.: 0.0000 Class :character
Median : 0.0000 Mode :character
Mean : 0.5426
3rd Qu.: 0.0000
Max. :25.0000
y
Length:4521
Class :character
Mode :character
The columns job, martial, education, default, housing, loan, contact, month, poutcome, and y need to be converted from characters.
#Bank Dataset
#Fixing Bank Data
#Fixing Martial + Others to Numeric
bank_numeric <- bank %>% mutate(
marital = case_when(marital == "married" ~ 1,
marital == "single" ~ 0,
marital == "divorced" ~ -1),
education = case_when(education == "primary" ~ 1,
education == "secondary" ~ 2,
education == "tertiary" ~ 3,
education == "unknown" ~ NA_real_),
default = case_when(default == "yes" ~ 1,
default == "no" ~ 0),
housing = case_when(housing == "yes" ~ 1,
housing == "no" ~ 0),
loan = case_when(loan == "yes" ~ 1,
loan == "no" ~ 0),
contact = case_when(contact == "cellular" ~ 1,
contact == "telephone" ~ 2,
contact == "unknown" ~ NA_real_),
poutcome = case_when(poutcome == "success" ~ 1,
poutcome == "other" ~ 0,
poutcome == "failure" ~ -1,
poutcome == "unknown" ~ 0),
y = case_when(y == "yes" ~ 1,
y == "no" ~ 0)
)
#Fixing Months from abb to numbers
months <- str_to_title(bank$month)
bank_numeric$month <- match(months, month.abb)
#Fixing Job Column
bank_numeric$job <- as.factor(bank$job)
bank_numeric$job <- unclass(bank_numeric$job)
summary(bank_numeric)
age job
Min. :19.00 Min. : 1.000
1st Qu.:33.00 1st Qu.: 2.000
Median :39.00 Median : 5.000
Mean :41.17 Mean : 5.411
3rd Qu.:49.00 3rd Qu.: 8.000
Max. :87.00 Max. :12.000
marital education
Min. :-1.0000 Min. :1.000
1st Qu.: 0.0000 1st Qu.:2.000
Median : 1.0000 Median :2.000
Mean : 0.5019 Mean :2.155
3rd Qu.: 1.0000 3rd Qu.:3.000
Max. : 1.0000 Max. :3.000
NA's :187
default balance
Min. :0.00000 Min. :-3313
1st Qu.:0.00000 1st Qu.: 69
Median :0.00000 Median : 444
Mean :0.01681 Mean : 1423
3rd Qu.:0.00000 3rd Qu.: 1480
Max. :1.00000 Max. :71188
housing loan
Min. :0.000 Min. :0.0000
1st Qu.:0.000 1st Qu.:0.0000
Median :1.000 Median :0.0000
Mean :0.566 Mean :0.1528
3rd Qu.:1.000 3rd Qu.:0.0000
Max. :1.000 Max. :1.0000
contact day
Min. :1.000 Min. : 1.00
1st Qu.:1.000 1st Qu.: 9.00
Median :1.000 Median :16.00
Mean :1.094 Mean :15.92
3rd Qu.:1.000 3rd Qu.:21.00
Max. :2.000 Max. :31.00
NA's :1324
month duration
Min. : 1.000 Min. : 4
1st Qu.: 5.000 1st Qu.: 104
Median : 6.000 Median : 185
Mean : 6.167 Mean : 264
3rd Qu.: 8.000 3rd Qu.: 329
Max. :12.000 Max. :3025
campaign pdays
Min. : 1.000 Min. : -1.00
1st Qu.: 1.000 1st Qu.: -1.00
Median : 2.000 Median : -1.00
Mean : 2.794 Mean : 39.77
3rd Qu.: 3.000 3rd Qu.: -1.00
Max. :50.000 Max. :871.00
previous poutcome
Min. : 0.0000 Min. :-1.00000
1st Qu.: 0.0000 1st Qu.: 0.00000
Median : 0.0000 Median : 0.00000
Mean : 0.5426 Mean :-0.07985
3rd Qu.: 0.0000 3rd Qu.: 0.00000
Max. :25.0000 Max. : 1.00000
y
Min. :0.0000
1st Qu.:0.0000
Median :0.0000
Mean :0.1152
3rd Qu.:0.0000
Max. :1.0000
hist(bank_numeric$duration, breaks = c(0,10,60,120,300,600,1200,3025))
bank_numeric_lessthan10min <- bank_numeric %>% subset(duration <= 600)
bank_numeric_morethan10min <- bank_numeric %>% subset(duration > 600)
hist(bank_numeric_lessthan10min$duration)
hist(bank_numeric_morethan10min$duration)
boxplot(bank_numeric_lessthan10min$duration)
bank_numeric$y <- as.factor(bank_numeric$y)
x <- bank_numeric %>% subset(select = -c(contact,education,y))
y <- bank_numeric %>% subset(select = c(y))
x_train <- sample_n(x, 0.7*4521)
y_train <- sample_n(y, 0.7*4521)
x_test <- sample_n(x, 0.3*4521)
y_test <- sample_n(y, 0.3*4521)
train <- cbind(x_train, y_train)
test <- cbind(x_test, y_test)
bank_knn <- knn(train = scale(x_train),
test = scale(x_test),
cl = y_train$y,
k = 5)
error <- mean(y_test$y != bank_knn)
print(paste("Accuracy = ", 1-error))
[1] "Accuracy = 0.87094395280236"
bank_knn <- knn(train = scale(x_train),
test = scale(x_test),
cl = y_train$y,
k = 7)
error <- mean(y_test$y != bank_knn)
print(paste("Accuracy = ", 1-error))
[1] "Accuracy = 0.881268436578171"
bank_knn <- knn(train = scale(x_train),
test = scale(x_test),
cl = y_train$y,
k = 9)
table(y_test$y, bank_knn)
bank_knn
0 1
0 1194 4
1 158 0
error <- mean(y_test$y != bank_knn)
print(paste("Accuracy = ", 1-error))
[1] "Accuracy = 0.880530973451327"
bank_knn <- knn(train = scale(x_train),
test = scale(x_test),
cl = y_train$y,
k = 11)
error <- mean(y_test$y != bank_knn)
print(paste("Accuracy = ", 1-error))
[1] "Accuracy = 0.883480825958702"
x = rbind(as.matrix(x_train))
y = as.matrix(y_train)
svp <- ksvm(x,y,type="C-svc")
svp_pred <- predict(svp, as.matrix(x_test))
table(svp_pred, as.matrix(y_test))
svp_pred 0 1
0 1198 157
1 0 1
agreement <- svp_pred == as.matrix(y_test)
prop.table(table(agreement))
agreement
FALSE TRUE
0.1157817 0.8842183
#plot(svp, data = x)
bank.rf <- randomForest(y~., data = train,
ntree = 45,
importance = TRUE,
proximity = TRUE)
print(bank.rf)
Call:
randomForest(formula = y ~ ., data = train, ntree = 45, importance = TRUE, proximity = TRUE)
Type of random forest: classification
Number of trees: 45
No. of variables tried at each split: 3
OOB estimate of error rate: 12.61%
Confusion matrix:
0 1 class.error
0 2765 12 0.00432121
1 387 0 1.00000000
plot(bank.rf)
pred <- predict(bank.rf, x_test)
table(pred, test$y)
pred 0 1
0 1112 143
1 86 15
agreement <- pred == test$y
prop.table(table(agreement))
agreement
FALSE TRUE
0.1688791 0.8311209
importance = importance(bank.rf)
var_importance = data.frame(Variables = row.names(importance),
Importance =round(importance[, 'MeanDecreaseAccuracy'],2))
rank_importance=var_importance %>%
mutate(Rank=paste('#',dense_rank(desc(Importance))))
ggplot(rank_importance,aes(x=reorder(Variables,Importance),
y=Importance,fill=Importance))+
geom_bar(stat='identity') +
geom_text(aes(x = Variables, y = 0.5, label = Rank),
hjust=0, vjust=0.55, size = 4, colour = 'white') +
labs(x = 'Variables') +
coord_flip()
importance = importance(bank.rf)
var_importance = data.frame(Variables = row.names(importance),
Importance =round(importance[, 'MeanDecreaseGini'],2))
rank_importance=var_importance %>%
mutate(Rank=paste('#',dense_rank(desc(Importance))))
ggplot(rank_importance,aes(x=reorder(Variables,Importance),
y=Importance,fill=Importance))+
geom_bar(stat='identity') +
geom_text(aes(x = Variables, y = 0.5, label = Rank),
hjust=0, vjust=0.55, size = 4, colour = 'white') +
labs(x = 'Variables') +
coord_flip()
#Wine Dataset
hist(as.numeric(white_wine$quality))
white_wine$quality <- as.factor(white_wine$quality)
x <- white_wine %>% subset(select = -c(quality))
y <- white_wine %>% subset(select = c(quality))
y <- y %>% mutate(
quality = case_when(quality == 3 | quality == 4 ~ -1,
quality == 5 | quality == 6 | quality == 7 ~ 0,
quality == 8 | quality == 9 ~ 1)
)
y$quality <- as.factor(y$quality)
summary(y)
quality
-1: 183
0 :4535
1 : 180
x_train <- sample_n(x, 0.7*4898)
y_train <- sample_n(y, 0.7*4898)
x_test <- sample_n(x, 0.3*4898)
y_test <- sample_n(y, 0.3*4898)
train <- cbind(x_train, y_train)
test <- cbind(x_test, y_test)
white_wine_knn <- knn(train = scale(x_train),
test = scale(x_test),
cl = y_train$quality,
k = 5)
error <- mean(y_test$quality != white_wine_knn)
print(paste("Accuracy = ", 1-error))
[1] "Accuracy = 0.926480599046971"
white_wine_knn <- knn(train = scale(x_train),
test = scale(x_test),
cl = y_train$quality,
k = 7)
table(y_test$quality, white_wine_knn)
white_wine_knn
-1 0 1
-1 0 59 0
0 0 1363 0
1 0 47 0
error <- mean(y_test$quality != white_wine_knn)
print(paste("Accuracy = ", 1-error))
[1] "Accuracy = 0.92784206943499"
white_wine_knn <- knn(train = scale(x_train),
test = scale(x_test),
cl = y_train$quality,
k = 9)
error <- mean(y_test$quality != white_wine_knn)
print(paste("Accuracy = ", 1-error))
[1] "Accuracy = 0.92784206943499"
white_wine_knn <- knn(train = scale(x_train),
test = scale(x_test),
cl = y_train$quality,
k = 11)
error <- mean(y_test$quality != white_wine_knn)
print(paste("Accuracy = ", 1-error))
[1] "Accuracy = 0.92784206943499"
x = rbind(as.matrix(x_train))
y = as.matrix(y_train)
svp <- ksvm(x,y,type="C-svc")
svp_pred <- predict(svp, as.matrix(x_test))
table(svp_pred, as.matrix(y_test))
svp_pred -1 0 1
-1 0 0 0
0 59 1363 47
1 0 0 0
agreement <- svp_pred == as.matrix(y_test)
prop.table(table(agreement))
agreement
FALSE TRUE
0.07215793 0.92784207
#plot(svp, data = x)
white_wine.rf <- randomForest(quality~., data = train,
ntree = 40,
importance = TRUE,
proximity = TRUE)
print(white_wine.rf)
Call:
randomForest(formula = quality ~ ., data = train, ntree = 40, importance = TRUE, proximity = TRUE)
Type of random forest: classification
Number of trees: 40
No. of variables tried at each split: 3
OOB estimate of error rate: 8.52%
Confusion matrix:
-1 0 1 class.error
-1 0 124 0 1.00000000
0 22 3134 20 0.01322418
1 0 126 2 0.98437500
plot(white_wine.rf)
pred <- predict(white_wine.rf, x_test)
table(pred, test$quality)
pred -1 0 1
-1 2 28 1
0 55 1305 46
1 2 30 0
agreement <- pred == test$quality
prop.table(table(agreement))
agreement
FALSE TRUE
0.1102791 0.8897209
importance = importance(white_wine.rf)
var_importance = data.frame(Variables = row.names(importance),
Importance =round(importance[, 'MeanDecreaseAccuracy'],2))
rank_importance=var_importance %>%
mutate(Rank=paste('#',dense_rank(desc(Importance))))
ggplot(rank_importance,aes(x=reorder(Variables,Importance),
y=Importance,fill=Importance))+
geom_bar(stat='identity') +
geom_text(aes(x = Variables, y = 0.5, label = Rank),
hjust=0, vjust=0.55, size = 4, colour = 'white') +
labs(x = 'Variables') +
coord_flip()
importance = importance(white_wine.rf)
var_importance = data.frame(Variables = row.names(importance),
Importance =round(importance[, 'MeanDecreaseGini'],2))
rank_importance=var_importance %>%
mutate(Rank=paste('#',dense_rank(desc(Importance))))
ggplot(rank_importance,aes(x=reorder(Variables,Importance),
y=Importance,fill=Importance))+
geom_bar(stat='identity') +
geom_text(aes(x = Variables, y = 0.5, label = Rank),
hjust=0, vjust=0.55, size = 4, colour = 'white') +
labs(x = 'Variables') +
coord_flip()
red_wine$quality <- as.factor(red_wine$quality)
x <- red_wine %>% subset(select = -c(quality))
y <- red_wine %>% subset(select = c(quality))
y <- y %>% mutate(
quality = case_when(quality == 3 | quality == 4 ~ -1,
quality == 5 | quality == 6 | quality == 7 ~ 0,
quality == 8 | quality == 9 ~ 1)
)
y$quality <- as.factor(y$quality)
summary(y)
quality
-1: 63
0 :1518
1 : 18
x_train <- sample_n(x, 0.7*1599)
y_train <- sample_n(y, 0.7*1599)
x_test <- sample_n(x, 0.3*1599)
y_test <- sample_n(y, 0.3*1599)
red_wine_knn <- knn(train = scale(x_train),
test = scale(x_test),
cl = y_train$quality,
k = 5)
error <- mean(y_test$quality != red_wine_knn)
print(paste("Accuracy = ", 1-error))
[1] "Accuracy = 0.937369519832985"
red_wine_knn <- knn(train = scale(x_train),
test = scale(x_test),
cl = y_train$quality,
k = 7)
error <- mean(y_test$quality != red_wine_knn)
print(paste("Accuracy = ", 1-error))
[1] "Accuracy = 0.935281837160752"
red_wine_knn <- knn(train = scale(x_train),
test = scale(x_test),
cl = y_train$quality,
k = 9)
table(y_test$quality, red_wine_knn)
red_wine_knn
-1 0 1
-1 0 21 0
0 0 449 0
1 0 9 0
error <- mean(y_test$quality != red_wine_knn)
print(paste("Accuracy = ", 1-error))
[1] "Accuracy = 0.937369519832985"
red_wine_knn <- knn(train = scale(x_train),
test = scale(x_test),
cl = y_train$quality,
k = 11)
error <- mean(y_test$quality != red_wine_knn)
print(paste("Accuracy = ", 1-error))
[1] "Accuracy = 0.937369519832985"
x = rbind(as.matrix(x_train))
y = as.matrix(y_train)
svp <- ksvm(x,y,type="C-svc")
svp_pred <- predict(svp, as.matrix(x_test))
table(svp_pred, as.matrix(y_test))
svp_pred -1 0 1
-1 0 0 0
0 21 449 9
1 0 0 0
agreement <- svp_pred == as.matrix(y_test)
prop.table(table(agreement))
agreement
FALSE TRUE
0.06263048 0.93736952
#plot(svp, data = x)
train <- cbind(x_train, y_train)
test <- cbind(x_test, y_test)
red_wine.rf <- randomForest(quality~., data = train,
ntree = 30,
importance = TRUE,
proximity = TRUE)
print(red_wine.rf)
Call:
randomForest(formula = quality ~ ., data = train, ntree = 30, importance = TRUE, proximity = TRUE)
Type of random forest: classification
Number of trees: 30
No. of variables tried at each split: 3
OOB estimate of error rate: 6.7%
Confusion matrix:
-1 0 1 class.error
-1 0 49 0 1.00000000
0 12 1044 2 0.01323251
1 0 12 0 1.00000000
plot(red_wine.rf)
pred <- predict(red_wine.rf, x_test)
table(pred, test$quality)
pred -1 0 1
-1 1 10 0
0 20 439 9
1 0 0 0
agreement <- pred == test$quality
prop.table(table(agreement))
agreement
FALSE TRUE
0.08141962 0.91858038
importance = importance(red_wine.rf)
var_importance = data.frame(Variables = row.names(importance),
Importance =round(importance[, 'MeanDecreaseAccuracy'],2))
rank_importance=var_importance %>%
mutate(Rank=paste('#',dense_rank(desc(Importance))))
ggplot(rank_importance,aes(x=reorder(Variables,Importance),
y=Importance,fill=Importance))+
geom_bar(stat='identity') +
geom_text(aes(x = Variables, y = 0.5, label = Rank),
hjust=0, vjust=0.55, size = 4, colour = 'white') +
labs(x = 'Variables') +
coord_flip()
importance = importance(red_wine.rf)
var_importance = data.frame(Variables = row.names(importance),
Importance =round(importance[, 'MeanDecreaseGini'],2))
rank_importance=var_importance %>%
mutate(Rank=paste('#',dense_rank(desc(Importance))))
ggplot(rank_importance,aes(x=reorder(Variables,Importance),
y=Importance,fill=Importance))+
geom_bar(stat='identity') +
geom_text(aes(x = Variables, y = 0.5, label = Rank),
hjust=0, vjust=0.55, size = 4, colour = 'white') +
labs(x = 'Variables') +
coord_flip()