library(randomForest)
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
library(ggplot2)
## Warning in register(): Can't find generic `scale_type` in package ggplot2 to
## register S3 method.
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(kernlab)
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
## 
##     alpha
library(class)
red_wine <- read.csv("winequality-red.csv",sep = ";")
white_wine <- read.csv("winequality-white.csv",sep = ";")
bank <- read.csv("bank.csv", sep = ";")
print("Summary of red_wine and white_wine")
## [1] "Summary of red_wine and white_wine"
summary(red_wine)
##  fixed.acidity   volatile.acidity  citric.acid    residual.sugar  
##  Min.   : 4.60   Min.   :0.1200   Min.   :0.000   Min.   : 0.900  
##  1st Qu.: 7.10   1st Qu.:0.3900   1st Qu.:0.090   1st Qu.: 1.900  
##  Median : 7.90   Median :0.5200   Median :0.260   Median : 2.200  
##  Mean   : 8.32   Mean   :0.5278   Mean   :0.271   Mean   : 2.539  
##  3rd Qu.: 9.20   3rd Qu.:0.6400   3rd Qu.:0.420   3rd Qu.: 2.600  
##  Max.   :15.90   Max.   :1.5800   Max.   :1.000   Max.   :15.500  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide    density      
##  Min.   :0.01200   Min.   : 1.00       Min.   :  6.00       Min.   :0.9901  
##  1st Qu.:0.07000   1st Qu.: 7.00       1st Qu.: 22.00       1st Qu.:0.9956  
##  Median :0.07900   Median :14.00       Median : 38.00       Median :0.9968  
##  Mean   :0.08747   Mean   :15.87       Mean   : 46.47       Mean   :0.9967  
##  3rd Qu.:0.09000   3rd Qu.:21.00       3rd Qu.: 62.00       3rd Qu.:0.9978  
##  Max.   :0.61100   Max.   :72.00       Max.   :289.00       Max.   :1.0037  
##        pH          sulphates         alcohol         quality     
##  Min.   :2.740   Min.   :0.3300   Min.   : 8.40   Min.   :3.000  
##  1st Qu.:3.210   1st Qu.:0.5500   1st Qu.: 9.50   1st Qu.:5.000  
##  Median :3.310   Median :0.6200   Median :10.20   Median :6.000  
##  Mean   :3.311   Mean   :0.6581   Mean   :10.42   Mean   :5.636  
##  3rd Qu.:3.400   3rd Qu.:0.7300   3rd Qu.:11.10   3rd Qu.:6.000  
##  Max.   :4.010   Max.   :2.0000   Max.   :14.90   Max.   :8.000
summary(white_wine)
##  fixed.acidity    volatile.acidity  citric.acid     residual.sugar  
##  Min.   : 3.800   Min.   :0.0800   Min.   :0.0000   Min.   : 0.600  
##  1st Qu.: 6.300   1st Qu.:0.2100   1st Qu.:0.2700   1st Qu.: 1.700  
##  Median : 6.800   Median :0.2600   Median :0.3200   Median : 5.200  
##  Mean   : 6.855   Mean   :0.2782   Mean   :0.3342   Mean   : 6.391  
##  3rd Qu.: 7.300   3rd Qu.:0.3200   3rd Qu.:0.3900   3rd Qu.: 9.900  
##  Max.   :14.200   Max.   :1.1000   Max.   :1.6600   Max.   :65.800  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide    density      
##  Min.   :0.00900   Min.   :  2.00      Min.   :  9.0        Min.   :0.9871  
##  1st Qu.:0.03600   1st Qu.: 23.00      1st Qu.:108.0        1st Qu.:0.9917  
##  Median :0.04300   Median : 34.00      Median :134.0        Median :0.9937  
##  Mean   :0.04577   Mean   : 35.31      Mean   :138.4        Mean   :0.9940  
##  3rd Qu.:0.05000   3rd Qu.: 46.00      3rd Qu.:167.0        3rd Qu.:0.9961  
##  Max.   :0.34600   Max.   :289.00      Max.   :440.0        Max.   :1.0390  
##        pH          sulphates         alcohol         quality     
##  Min.   :2.720   Min.   :0.2200   Min.   : 8.00   Min.   :3.000  
##  1st Qu.:3.090   1st Qu.:0.4100   1st Qu.: 9.50   1st Qu.:5.000  
##  Median :3.180   Median :0.4700   Median :10.40   Median :6.000  
##  Mean   :3.188   Mean   :0.4898   Mean   :10.51   Mean   :5.878  
##  3rd Qu.:3.280   3rd Qu.:0.5500   3rd Qu.:11.40   3rd Qu.:6.000  
##  Max.   :3.820   Max.   :1.0800   Max.   :14.20   Max.   :9.000
print("Summary of bank")
## [1] "Summary of bank"
summary(bank)
##       age            job              marital           education        
##  Min.   :19.00   Length:4521        Length:4521        Length:4521       
##  1st Qu.:33.00   Class :character   Class :character   Class :character  
##  Median :39.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :41.17                                                           
##  3rd Qu.:49.00                                                           
##  Max.   :87.00                                                           
##    default             balance        housing              loan          
##  Length:4521        Min.   :-3313   Length:4521        Length:4521       
##  Class :character   1st Qu.:   69   Class :character   Class :character  
##  Mode  :character   Median :  444   Mode  :character   Mode  :character  
##                     Mean   : 1423                                        
##                     3rd Qu.: 1480                                        
##                     Max.   :71188                                        
##    contact               day           month              duration   
##  Length:4521        Min.   : 1.00   Length:4521        Min.   :   4  
##  Class :character   1st Qu.: 9.00   Class :character   1st Qu.: 104  
##  Mode  :character   Median :16.00   Mode  :character   Median : 185  
##                     Mean   :15.92                      Mean   : 264  
##                     3rd Qu.:21.00                      3rd Qu.: 329  
##                     Max.   :31.00                      Max.   :3025  
##     campaign          pdays           previous         poutcome        
##  Min.   : 1.000   Min.   : -1.00   Min.   : 0.0000   Length:4521       
##  1st Qu.: 1.000   1st Qu.: -1.00   1st Qu.: 0.0000   Class :character  
##  Median : 2.000   Median : -1.00   Median : 0.0000   Mode  :character  
##  Mean   : 2.794   Mean   : 39.77   Mean   : 0.5426                     
##  3rd Qu.: 3.000   3rd Qu.: -1.00   3rd Qu.: 0.0000                     
##  Max.   :50.000   Max.   :871.00   Max.   :25.0000                     
##       y            
##  Length:4521       
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

The columns job, martial, education, default, housing, loan, contact, month, poutcome, and y need to be converted from characters.

#Bank Dataset

#Fixing Bank Data

#Fixing Martial + Others to Numeric
bank_numeric <- bank %>% mutate(
  marital = case_when(marital == "married" ~ 1,
                      marital == "single" ~ 0,
                      marital == "divorced" ~ -1),
  education = case_when(education == "primary" ~ 1, 
                        education == "secondary" ~ 2, 
                        education == "tertiary" ~ 3, 
                        education == "unknown" ~ NA_real_),
  default = case_when(default == "yes" ~ 1,
                      default == "no" ~ 0),
  housing = case_when(housing == "yes" ~ 1,
                      housing == "no" ~ 0),
  loan = case_when(loan == "yes" ~ 1, 
                   loan == "no" ~ 0),
  contact = case_when(contact == "cellular" ~ 1,
                      contact == "telephone" ~ 2,
                      contact == "unknown" ~ NA_real_),
  poutcome = case_when(poutcome == "success" ~ 1,
                       poutcome == "other" ~ 0,
                       poutcome == "failure" ~ -1, 
                       poutcome == "unknown" ~ 0),
  y = case_when(y == "yes" ~ 1,
                y == "no" ~ 0)
)

#Fixing Months from abb to numbers 

months <- str_to_title(bank$month)
bank_numeric$month <- match(months, month.abb)

#Fixing Job Column 
bank_numeric$job <- as.factor(bank$job)
bank_numeric$job <- unclass(bank_numeric$job)

summary(bank_numeric)
##       age             job            marital          education    
##  Min.   :19.00   Min.   : 1.000   Min.   :-1.0000   Min.   :1.000  
##  1st Qu.:33.00   1st Qu.: 2.000   1st Qu.: 0.0000   1st Qu.:2.000  
##  Median :39.00   Median : 5.000   Median : 1.0000   Median :2.000  
##  Mean   :41.17   Mean   : 5.411   Mean   : 0.5019   Mean   :2.155  
##  3rd Qu.:49.00   3rd Qu.: 8.000   3rd Qu.: 1.0000   3rd Qu.:3.000  
##  Max.   :87.00   Max.   :12.000   Max.   : 1.0000   Max.   :3.000  
##                                                     NA's   :187    
##     default           balance         housing           loan       
##  Min.   :0.00000   Min.   :-3313   Min.   :0.000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.:   69   1st Qu.:0.000   1st Qu.:0.0000  
##  Median :0.00000   Median :  444   Median :1.000   Median :0.0000  
##  Mean   :0.01681   Mean   : 1423   Mean   :0.566   Mean   :0.1528  
##  3rd Qu.:0.00000   3rd Qu.: 1480   3rd Qu.:1.000   3rd Qu.:0.0000  
##  Max.   :1.00000   Max.   :71188   Max.   :1.000   Max.   :1.0000  
##                                                                    
##     contact           day            month           duration   
##  Min.   :1.000   Min.   : 1.00   Min.   : 1.000   Min.   :   4  
##  1st Qu.:1.000   1st Qu.: 9.00   1st Qu.: 5.000   1st Qu.: 104  
##  Median :1.000   Median :16.00   Median : 6.000   Median : 185  
##  Mean   :1.094   Mean   :15.92   Mean   : 6.167   Mean   : 264  
##  3rd Qu.:1.000   3rd Qu.:21.00   3rd Qu.: 8.000   3rd Qu.: 329  
##  Max.   :2.000   Max.   :31.00   Max.   :12.000   Max.   :3025  
##  NA's   :1324                                                   
##     campaign          pdays           previous          poutcome       
##  Min.   : 1.000   Min.   : -1.00   Min.   : 0.0000   Min.   :-1.00000  
##  1st Qu.: 1.000   1st Qu.: -1.00   1st Qu.: 0.0000   1st Qu.: 0.00000  
##  Median : 2.000   Median : -1.00   Median : 0.0000   Median : 0.00000  
##  Mean   : 2.794   Mean   : 39.77   Mean   : 0.5426   Mean   :-0.07985  
##  3rd Qu.: 3.000   3rd Qu.: -1.00   3rd Qu.: 0.0000   3rd Qu.: 0.00000  
##  Max.   :50.000   Max.   :871.00   Max.   :25.0000   Max.   : 1.00000  
##                                                                        
##        y         
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.1152  
##  3rd Qu.:0.0000  
##  Max.   :1.0000  
## 
hist(bank_numeric$duration, breaks = c(0,10,60,120,300,600,1200,3025))

bank_numeric_lessthan10min <- bank_numeric %>% subset(duration <= 600)
bank_numeric_morethan10min <- bank_numeric %>% subset(duration > 600)
hist(bank_numeric_lessthan10min$duration)

hist(bank_numeric_morethan10min$duration)

boxplot(bank_numeric_lessthan10min$duration)

bank_numeric$y <- as.factor(bank_numeric$y)
x <- bank_numeric %>% subset(select = -c(contact,education,y))
y <- bank_numeric %>% subset(select = c(y))


x_train <- sample_n(x, 0.7*4521)
y_train <- sample_n(y, 0.7*4521)
x_test <- sample_n(x, 0.3*4521)
y_test <- sample_n(y, 0.3*4521)

train <- cbind(x_train, y_train)
test <- cbind(x_test, y_test)
bank_knn <- knn(train = scale(x_train), 
                test = scale(x_test), 
                cl = y_train$y,
                k = 9)
table(y_test$y, bank_knn)
##    bank_knn
##        0    1
##   0 1212    1
##   1  143    0
error <- mean(y_test$y != bank_knn)
print(paste("Accuracy = ", 1-error))
## [1] "Accuracy =  0.893805309734513"
x = rbind(as.matrix(x_train))
y = as.matrix(y_train)

svp <- ksvm(x,y,type="C-svc")
svp_pred <- predict(svp, as.matrix(x_test))
table(svp_pred, as.matrix(y_test))
##         
## svp_pred    0    1
##        0 1212  143
##        1    1    0
agreement <- svp_pred == as.matrix(y_test)
prop.table(table(agreement))
## agreement
##     FALSE      TRUE 
## 0.1061947 0.8938053
#plot(svp, data = x)
bank.rf <- randomForest(y~., data = train,
                        importance = TRUE,
                        proximity = TRUE)
print(bank.rf)
## 
## Call:
##  randomForest(formula = y ~ ., data = train, importance = TRUE,      proximity = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 11.13%
## Confusion matrix:
##      0 1  class.error
## 0 2812 1 0.0003554924
## 1  351 0 1.0000000000
plot(bank.rf)

pred <- predict(bank.rf, x_test)

table(pred, test$y)
##     
## pred    0    1
##    0 1122  130
##    1   91   13
agreement <- pred == test$y
prop.table(table(agreement))
## agreement
##     FALSE      TRUE 
## 0.1629794 0.8370206
importance = importance(bank.rf)
var_importance = data.frame(Variables = row.names(importance),
                           Importance =round(importance[, 'MeanDecreaseAccuracy'],2))
rank_importance=var_importance %>%
  mutate(Rank=paste('#',dense_rank(desc(Importance))))
ggplot(rank_importance,aes(x=reorder(Variables,Importance),
 y=Importance,fill=Importance))+ 
 geom_bar(stat='identity') + 
 geom_text(aes(x = Variables, y = 0.5, label = Rank),
 hjust=0, vjust=0.55, size = 4, colour = 'white') +
 labs(x = 'Variables') +
 coord_flip() 

importance = importance(bank.rf)
var_importance = data.frame(Variables = row.names(importance),
                           Importance =round(importance[, 'MeanDecreaseGini'],2))
rank_importance=var_importance %>%
  mutate(Rank=paste('#',dense_rank(desc(Importance))))
ggplot(rank_importance,aes(x=reorder(Variables,Importance),
 y=Importance,fill=Importance))+ 
 geom_bar(stat='identity') + 
 geom_text(aes(x = Variables, y = 0.5, label = Rank),
 hjust=0, vjust=0.55, size = 4, colour = 'white') +
 labs(x = 'Variables') +
 coord_flip() 

#Wine Dataset

hist(as.numeric(white_wine$quality))

white_wine$quality <- as.factor(white_wine$quality)
x <- white_wine %>% subset(select = -c(quality))
y <- white_wine %>% subset(select = c(quality))

y <- y %>% mutate(
  quality = case_when(quality == 3 | quality == 4 ~ -1,
                      quality == 5 | quality == 6 | quality == 7 ~ 0, 
                      quality == 8 | quality == 9 ~ 1)
  
)

y$quality <- as.factor(y$quality)
summary(y)
##  quality  
##  -1: 183  
##  0 :4535  
##  1 : 180
x_train <- sample_n(x, 0.7*4898)
y_train <- sample_n(y, 0.7*4898)
x_test <- sample_n(x, 0.3*4898)
y_test <- sample_n(y, 0.3*4898)

train <- cbind(x_train, y_train)
test <- cbind(x_test, y_test)
white_wine_knn <- knn(train = scale(x_train), 
                test = scale(x_test), 
                cl = y_train$quality,
                k = 7)
table(y_test$quality, white_wine_knn)
##     white_wine_knn
##        -1    0    1
##   -1    0   53    0
##   0     0 1358    0
##   1     0   58    0
error <- mean(y_test$quality != white_wine_knn)
print(paste("Accuracy = ", 1-error))
## [1] "Accuracy =  0.924438393464942"
x = rbind(as.matrix(x_train))
y = as.matrix(y_train)

svp <- ksvm(x,y,type="C-svc")
svp_pred <- predict(svp, as.matrix(x_test))
table(svp_pred, as.matrix(y_test))
##         
## svp_pred   -1    0    1
##       -1    0    0    0
##       0    53 1358   58
##       1     0    0    0
agreement <- svp_pred == as.matrix(y_test)
prop.table(table(agreement))
## agreement
##      FALSE       TRUE 
## 0.07556161 0.92443839
#plot(svp, data = x)
white_wine.rf <- randomForest(quality~., data = train,
                              ntree = 40,
                              importance = TRUE,
                              proximity = TRUE)
print(white_wine.rf)
## 
## Call:
##  randomForest(formula = quality ~ ., data = train, ntree = 40,      importance = TRUE, proximity = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 40
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 8.93%
## Confusion matrix:
##    -1    0  1 class.error
## -1  2  124  0  0.98412698
## 0  25 3120 30  0.01732283
## 1   0  127  0  1.00000000
plot(white_wine.rf)

pred <- predict(white_wine.rf, x_test)

table(pred, test$quality)
##     
## pred   -1    0    1
##   -1    1   36    1
##   0    52 1294   56
##   1     0   28    1
agreement <- pred == test$quality
prop.table(table(agreement))
## agreement
##     FALSE      TRUE 
## 0.1177672 0.8822328
importance = importance(white_wine.rf)
var_importance = data.frame(Variables = row.names(importance),
                           Importance =round(importance[, 'MeanDecreaseAccuracy'],2))
rank_importance=var_importance %>%
  mutate(Rank=paste('#',dense_rank(desc(Importance))))
ggplot(rank_importance,aes(x=reorder(Variables,Importance),
 y=Importance,fill=Importance))+ 
 geom_bar(stat='identity') + 
 geom_text(aes(x = Variables, y = 0.5, label = Rank),
 hjust=0, vjust=0.55, size = 4, colour = 'white') +
 labs(x = 'Variables') +
 coord_flip() 

importance = importance(white_wine.rf)
var_importance = data.frame(Variables = row.names(importance),
                           Importance =round(importance[, 'MeanDecreaseGini'],2))
rank_importance=var_importance %>%
  mutate(Rank=paste('#',dense_rank(desc(Importance))))
ggplot(rank_importance,aes(x=reorder(Variables,Importance),
 y=Importance,fill=Importance))+ 
 geom_bar(stat='identity') + 
 geom_text(aes(x = Variables, y = 0.5, label = Rank),
 hjust=0, vjust=0.55, size = 4, colour = 'white') +
 labs(x = 'Variables') +
 coord_flip() 

red_wine$quality <- as.factor(red_wine$quality)
x <- red_wine %>% subset(select = -c(quality))
y <- red_wine %>% subset(select = c(quality))

y <- y %>% mutate(
  quality = case_when(quality == 3 | quality == 4 ~ -1,
                      quality == 5 | quality == 6 | quality == 7 ~ 0, 
                      quality == 8 | quality == 9 ~ 1)
  
)

y$quality <- as.factor(y$quality)
summary(y)
##  quality  
##  -1:  63  
##  0 :1518  
##  1 :  18
x_train <- sample_n(x, 0.7*1599)
y_train <- sample_n(y, 0.7*1599)
x_test <- sample_n(x, 0.3*1599)
y_test <- sample_n(y, 0.3*1599)
red_wine_knn <- knn(train = scale(x_train), 
                test = scale(x_test), 
                cl = y_train$quality,
                k = 7)
table(y_test$quality, red_wine_knn)
##     red_wine_knn
##       -1   0   1
##   -1   0  22   0
##   0    0 454   0
##   1    0   3   0
error <- mean(y_test$quality != red_wine_knn)
print(paste("Accuracy = ", 1-error))
## [1] "Accuracy =  0.947807933194154"
x = rbind(as.matrix(x_train))
y = as.matrix(y_train)

svp <- ksvm(x,y,type="C-svc")
svp_pred <- predict(svp, as.matrix(x_test))
table(svp_pred, as.matrix(y_test))
##         
## svp_pred  -1   0   1
##       -1   0   0   0
##       0   22 454   3
##       1    0   0   0
agreement <- svp_pred == as.matrix(y_test)
prop.table(table(agreement))
## agreement
##      FALSE       TRUE 
## 0.05219207 0.94780793
#plot(svp, data = x)
train <- cbind(x_train, y_train)
test <- cbind(x_test, y_test)

red_wine.rf <- randomForest(quality~., data = train,
                            ntree = 30,
                            importance = TRUE,
                            proximity = TRUE)
print(red_wine.rf)
## 
## Call:
##  randomForest(formula = quality ~ ., data = train, ntree = 30,      importance = TRUE, proximity = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 30
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 6.79%
## Confusion matrix:
##    -1    0 1 class.error
## -1  0   48 0  1.00000000
## 0  14 1043 2  0.01510859
## 1   0   12 0  1.00000000
plot(red_wine.rf)

pred <- predict(red_wine.rf, x_test)

table(pred, test$quality)
##     
## pred  -1   0   1
##   -1   0   8   0
##   0   22 446   3
##   1    0   0   0
agreement <- pred == test$quality
prop.table(table(agreement))
## agreement
##      FALSE       TRUE 
## 0.06889353 0.93110647
importance = importance(red_wine.rf)
var_importance = data.frame(Variables = row.names(importance),
                           Importance =round(importance[, 'MeanDecreaseAccuracy'],2))
rank_importance=var_importance %>%
  mutate(Rank=paste('#',dense_rank(desc(Importance))))
ggplot(rank_importance,aes(x=reorder(Variables,Importance),
 y=Importance,fill=Importance))+ 
 geom_bar(stat='identity') + 
 geom_text(aes(x = Variables, y = 0.5, label = Rank),
 hjust=0, vjust=0.55, size = 4, colour = 'white') +
 labs(x = 'Variables') +
 coord_flip() 

importance = importance(red_wine.rf)
var_importance = data.frame(Variables = row.names(importance),
                           Importance =round(importance[, 'MeanDecreaseGini'],2))
rank_importance=var_importance %>%
  mutate(Rank=paste('#',dense_rank(desc(Importance))))
ggplot(rank_importance,aes(x=reorder(Variables,Importance),
 y=Importance,fill=Importance))+ 
 geom_bar(stat='identity') + 
 geom_text(aes(x = Variables, y = 0.5, label = Rank),
 hjust=0, vjust=0.55, size = 4, colour = 'white') +
 labs(x = 'Variables') +
 coord_flip()