################################################################################################################### ######## A simulation study based on the Iris dataset for comparing several classification methods ######### ######## The results are given in (the new) Section 6 of the paper ######### ################################################################################################################### getwd() setwd("\\\\filestore.soton.ac.uk/users/wl/mydocuments/mydocuments/MyResearch/My papers/inProgress/Classification/2018-8/R code") getwd() #To make sure that the dataset is in the right folder/directory iris<-read.csv("iris.csv",head=T) # Variables used = Sepal length and width irism<-iris[,c(1,2,5)] attach(irism) sigma1<-cov(irism[Class=="Iris-setosa",1:2]) sigma2<-cov(irism[Class=="Iris-versicolor",1:2]) sigma3<-cov(irism[Class=="Iris-virginica",1:2]) #### Select one of the three cases below to run #### ## Case 1: All population distributions are close to each other mu1<-colMeans(irism[Class=="Iris-setosa",1:2])+c(0,-0.5) mu2<-colMeans(irism[Class=="Iris-versicolor",1:2]) mu3<-colMeans(irism[Class=="Iris-virginica",1:2]) ## Case 2: 2 population distributions are close + 1 is reasonablely Separate mu1<-colMeans(irism[Class=="Iris-setosa",1:2]) mu2<-colMeans(irism[Class=="Iris-versicolor",1:2]) mu3<-colMeans(irism[Class=="Iris-virginica",1:2]) ## Case 3: All 3 population distributions are reasonablely Separate mu1<-colMeans(irism[Class=="Iris-setosa",1:2]) mu2<-colMeans(irism[Class=="Iris-versicolor",1:2]) mu3<-colMeans(irism[Class=="Iris-virginica",1:2])+c(1,0.5) #### The function for the confidence-set classifier #### classifierCS<-function(x,data,lambda) # x: the vector of measurements for new case # data: the training dataset matrix with the Class given in the 1st column # alpha: 1 - alpha is the proportion of correct classification of all future cases # lambda: the critical constant value for classification # (=9.175 for Q=10000, S=10000, p=2,c=3, ni=50) # which is computed using the program ConfidenceSetClassifier.R Part 2 { p<- length(x) #group1 Mx1<-data[data[,1]==1,2:(p+1)] mu1<-apply(Mx1,2,mean) sigma1<-cov(Mx1) N1<-dim(Mx1)[1] #group2 Mx2<-data[data[,1]==2,2:(p+1)] mu2<-apply(Mx2,2,mean) sigma2<-cov(Mx2) N2<-dim(Mx2)[1] #group3 Mx3<-data[data[,1]==3,2:(p+1)] mu3<-apply(Mx3,2,mean) sigma3<-cov(Mx3) N3<-dim(Mx3)[1] A1<- t(x-mu1)%*%solve(sigma1)%*%(x-mu1)-lambda A2<- t(x-mu2)%*%solve(sigma2)%*%(x-mu2)-lambda A3<- t(x-mu3)%*%solve(sigma3)%*%(x-mu3)-lambda if (A1>=0 & A2>=0 & A3>=0) {set<-9} #{set<-"{ }"}, i.e. the set is empty if (A1<0 & A2>=0 & A3>=0) {set<-1} #{set<-"{1}"}, i.e. the set contains 1 only if (A1>=0 & A2<0 & A3>=0) {set<-2} #{set<-"{2}"}, i.e. the set contains 2 only if (A1>=0 & A2>=0 & A3<0) {set<-3} #{set<-"{3}"}, i.e. the set contains 3 only if (A1<0 & A2<0 & A3>=0) {set<-12} #{set<-"{1,2}"}, i.e. the set contains 1 and 2 only if (A1<0 & A2>=0 & A3<0) {set<-13} #{set<-"{1,3}"}, i.e. the set contains 1 and 3 only if (A1>=0 & A2<0 & A3<0) {set<-23} #{set<-"{2,3}"}, i.e. the set contains 2 and 3 only if (A1<0 & A2<0 & A3<0) {set<-123} #{set<-"{1,2,3}"}, i.e. the set contains 1 and 2 and 3 set } set.seed(101) library(MASS) # simulate multivariate normal alpha<-0.05 sim<-100 # number of simulation set cases<-1000 # number of new cases for each class lambda<-9.175 # This critical constant is computed using ConfidenceSetClassifier.R (Part 2) # 9.257609 (1000*1000) seeds 101 # 9.187527 (10000*10000) seeds 101 # 8.18818 ((1000*1000) gamma=.80) ##################################################### # vector of % correct classified by each method # # for each simulation set # ##################################################### library("tree") library("nnet") library("e1071") probconf<-c() probtree<-c() problogit<-c() probbayes<-c() probsvm<-c() sizeconf<-c() # Start the clock! ptm <- proc.time() # One loop i for each simulated training dataset for (j in 1:sim) { ############################################################################# ## simulate the training data for building the classifiers ## ############################################################################# y1<-mvrnorm(n = 50, mu1, sigma1) y2<-mvrnorm(n = 50, mu2, sigma2) y3<-mvrnorm(n = 50, mu3, sigma3) yclass<-as.data.frame(rbind(y1,y2,y3)) yclass$Class<-as.factor(c(rep(1,50),rep(2,50),rep(3,50))) data<-cbind(yclass$Class,yclass$SepalLength,yclass$SepalWidth) ################################################################################################### ## simulate 3000 future cases, 1000 from each distribution ## ################################################################################################### y1<-mvrnorm(n = cases, mu1, sigma1) y2<-mvrnorm(n = cases, mu2, sigma2) y3<-mvrnorm(n = cases, mu3, sigma3) ycase<-as.data.frame(rbind(y1,y2,y3)) ycase$Class<-c(rep(1,cases),rep(2,cases),rep(3,cases)) ny<-nrow(ycase) #################################################### ## Classification by Classification Tree ## #################################################### #library("tree") modtree<-tree(Class~SepalLength+SepalWidth,data=yclass) ##An automatic selection of the best set of variables is involved #plot(modtree) #text(modtree) ################################################################ ## Classification by Multinomial Logistic regression ## ################################################################ #library("nnet") modlogit<-multinom(Class~SepalLength+SepalWidth,data=yclass) ##No selection of the best set of variables is involved #modlogit<-step(mod1) ############################### ## Bayes Classifier ## ############################### #library("e1071") modbayes<-naiveBayes(Class~SepalLength+SepalWidth,data=yclass) ##No selection of the best set of variables is involved ############################### ## Classification by SVM ## ############################### #library(e1071) y <- yclass$Class x <- subset(yclass,select=c(SepalLength,SepalWidth)) modsvm <- svm(x, y) ##An automatic selection of the best kernal is involved ########################################################################### ## Confidence Set method is given by the function classifierCS above ## ########################################################################### ## applying all classifiers to each new case for (i in 1:ny) { ycase$conf[i]<-classifierCS(c(ycase$SepalLength[i],ycase$SepalWidth[i]),data,lambda=lambda) ycase$tree[i]<-predict(modtree,newdata=ycase[i,1:2],type="class") ycase$logit[i]<-predict(modlogit,newdata=ycase[i,1:2],type="class") ycase$bayes[i]<-predict(modbayes,type="class",newdata=ycase[i,1:2]) ycase$svmm[i]<- predict(modsvm, ycase[i,1:2],type="class") ####### for Calculate the size of confidence set if (ycase$conf[i]==9){ycase$size[i]<-0} if (ycase$conf[i]<9){ycase$size[i]<-1} if (ycase$conf[i]>9 && ycase$conf[i]<123 ){ycase$size[i]<-2} if (ycase$conf[i]==123){ycase$size[i]<-3} } ####################################################### #### Calculate prob.of correct classified ######### ####################################################### ##### Confidence Set Method ##### ##### Selected correct classified correct1 <- nrow(subset(ycase, (Class==1 & conf==1)|(Class==1 & conf==12)|(Class==1 & conf==13)|(Class==1 & conf==123))) correct2 <- nrow(subset(ycase, (Class==2 & conf==2)|(Class==2 & conf==12)|(Class==2 & conf==23)|(Class==2 & conf==123))) correct3 <- nrow(subset(ycase, (Class==3 & conf==3)|(Class==3 & conf==13)|(Class==3 & conf==23)|(Class==3 & conf==123))) ####### Calculate the probability probconf[j]<-(correct1+correct2+correct3)/ny ####### Calculate the size of confidence set sizeconf[j]<-mean(ycase$size) ##### Classification Tree ## ##### Selected correct classified correct1 <- nrow(subset(ycase, (Class==1 & tree==1))) correct2 <- nrow(subset(ycase, (Class==2 & tree==2))) correct3 <- nrow(subset(ycase, (Class==3 & tree==3))) ####### Calculate the probability probtree[j]<-(correct1+correct2+correct3)/ny ##### Logistic Regression ## ##### Selected correct classified correct1 <- nrow(subset(ycase, (Class==1 & logit==1))) correct2 <- nrow(subset(ycase, (Class==2 & logit==2))) correct3 <- nrow(subset(ycase, (Class==3 & logit==3))) ####### Calculate the probability problogit[j]<-(correct1+correct2+correct3)/ny ##### Bayesian Method ## ##### Selected correct classified correct1 <- nrow(subset(ycase, (Class==1 & bayes==1))) correct2 <- nrow(subset(ycase, (Class==2 & bayes==2))) correct3 <- nrow(subset(ycase, (Class==3 & bayes==3))) ####### Calculate the probability probbayes[j]<-(correct1+correct2+correct3)/ny ##### Support Vector Machine ## ##### Selected correct classified correct1 <- nrow(subset(ycase, (Class==1 & svmm==1))) correct2 <- nrow(subset(ycase, (Class==2 & svmm==2))) correct3 <- nrow(subset(ycase, (Class==3 & svmm==3))) ####### Calculate the probability probsvm[j]<-(correct1+correct2+correct3)/ny ## waiting time indication chart pie(c(j,sim-j),radius=1,clockwise=T) } # Stop the clock time<-proc.time() - ptm time sim cases mu1 mu2 mu3 mean(sizeconf) ## \bar{M} in Table 2 for CS classifier sd(sizeconf) sum(probconf>(1-alpha))/sim ## This gives the \hat{\gamma} in Table 2 mean(probconf) ## This gives the \bar{p} in Table 2 sd(probconf) sum(probtree>(1-alpha))/sim ## This gives the \hat{\gamma} for Tree Classifier mean(probtree) sd(probtree) sum(problogit>(1-alpha))/sim ## This gives the \hat{\gamma} mean(problogit) sd(problogit) sum(probbayes>(1-alpha))/sim ## This gives the \hat{\gamma} mean(probbayes) sd(probbayes) sum(probsvm>(1-alpha))/sim ## This gives the \hat{\gamma} mean(probsvm) sd(probsvm) table(ycase$Class,ycase$conf,dnn=c("Observed","Predicted")) table(ycase$Class,ycase$tree,dnn=c("Observed","Predicted")) table(ycase$Class,ycase$logit,dnn=c("Observed","Predicted")) table(ycase$Class,ycase$bayes,dnn=c("Observed","Predicted")) table(ycase$Class,ycase$svmm,dnn=c("Observed","Predicted")) # plot proportion of correct classification over the "sim" simulations for all the classification methods plot(probconf,ylim=c(0.6,1),main=paste("Simulation =", sim, ", New cases =",cases, " per class", sep=" "), xlab ="",ylab ="") points(probtree,col="red") points(problogit,col="green") points(probbayes,col="blue") points(probsvm,col="yellow") legend("bottomright", inset=.01, title="Classification Methods", c("Confidence Set Method","Classification Tree","Logistic Regression","Bayesian Method","Support Vector Machine"), col=c("black","red","green","blue","yellow"),pch=1,cex=0.8)