## Practical session: Linear classification
## Jean-Philippe.Vert@mines.org
##
## In this practical session to test several linear methods for classification (LDA, logistic regression)
##
## Packages needed: MASS, glmpath, e1071


set.seed(111) # random number generator

####################################
## Prepare data
####################################

# Download South African heart disease data
hr <- read.table("http://www-stat.stanford.edu/~tibs/ElemStatLearn/datasets/SAheart.data", sep=",",head=T,row.names=1)

# Pretty plot
pairs(hr[1:9],pch=21,bg=c("red","green")[factor(hr$chd)])

# Scale data and create a random train / test split
n <- nrow(hr)
p <- ncol(hr)-1
test.ratio <- .2 # ratio of test/train samples
n.test <- ceiling(n*test.ratio)
testi <- sample(1:n,n.test)
traini <- setdiff(1:n,testi)

data.train <- hr[traini,]
data.test <- hr[testi,]


####################################
## LDA
####################################
library(MASS)

# Train the model
hr.lda <- lda(chd ~ sbp + tobacco + ldl + famhist + obesity + alcohol + age , data=data.train )
hr.lda

# Evaluate performance
ypred.train <- predict(hr.lda,data.train)$class
ypred.test <- predict(hr.lda,data.test)$class
# Training error
table(ypred.train , data.train$chd)
mean(ypred.train == data.train$chd)
# Test error
table(ypred.test , data.test$chd)
mean(ypred.test == data.test$chd)
# LOO error
hr.lda.loo <- lda(chd ~ sbp + tobacco + ldl + famhist + obesity + alcohol + age , data=data.train , CV=TRUE)
table(hr.lda.loo$class , data.train$chd)
mean(hr.lda.loo$class == data.train$chd)


####################################
## QDA (not a linear method!)
####################################

hr.qda <- qda(chd ~ sbp + tobacco + ldl + famhist + obesity + alcohol + age , data=data.train )
hr.qda

# Evaluate performance
ypred.train <- predict(hr.qda,data.train)$class
ypred.test <- predict(hr.qda,data.test)$class
# Training error
table(ypred.train , data.train$chd)
mean(ypred.train == data.train$chd)
# Test error
table(ypred.test , data.test$chd)
mean(ypred.test == data.test$chd)
# LOO error
hr.qda.loo <- qda(chd ~ sbp + tobacco + ldl + famhist + obesity + alcohol + age , data=data.train , CV=TRUE)
table(hr.qda.loo$class , data.train$chd)
mean(hr.qda.loo$class == data.train$chd)

####################################
## Logistic regression
####################################

# Fit a logistic regression model on all variable to reproduce Table 4.2
hr.lr <- glm(chd ~ sbp + tobacco + ldl + famhist + obesity + alcohol + age , family = binomial , data=hr)

summary(hr.lr)

# Iteratively remove the least significant variable until all are significant (Table 4.3)
summary(glm(chd ~ sbp + tobacco + ldl + famhist + obesity + age , family = binomial , data=hr))
summary(glm(chd ~ tobacco + ldl + famhist + obesity + age , family = binomial , data=hr))
summary(glm(chd ~ tobacco + ldl + famhist + age , family = binomial , data=hr))

# Alternatively, do it automatically 
library(MASS)
hr.step <- stepAIC(hr.lr)
summary(hr.step)

# Train the full model and the reduced model on the training set, predict on test set
hr.lr.full <- glm(chd ~ sbp + tobacco + ldl + famhist + obesity + alcohol + age , family = binomial , data=data.train)
ypred <- predict(hr.lr.full,data.test)>0
table(ypred , data.test$chd)
mean(ypred == data.test$chd)

hr.lr.small <- stepAIC(hr.lr.full)
ypred <- predict(hr.lr.small,data.test)>0
table(ypred , data.test$chd)
mean(ypred == data.test$chd)


####################################
## L1 regularized logistic regression
####################################

library(glmpath)

# we need to define x and y to use glmpath
x <- hr[,c(1,2,3,5,7,8,9)]
x$famhist <- as.numeric(x$famhist)
x <- as.matrix(x)
y <- hr[,10]

# Train the model
hr.glmpath <- glmpath(x , y , family = binomial)

# Visualize the model (Fig. 4.13)
plot(hr.glmpath)

# Cross-validation
a <- cv.glmpath(x , y , family = binomial , type="response")
plot(a$fraction,a$cv.error)


r <- cv.glmpath(x[traini,] , y[traini] , family = binomial , type="response")
plot(r$fraction,r$cv.error)
bestfraction <- r$fraction[which.min(r$cv.error)]
# Retrain a model on the training set
hr.glmpath <- glmpath(x[traini,] , y[traini] , family = binomial)
# Predict (with the regularization parameter optimized on the training set)
ypred <- predict(hr.glmpath,x[testi,] , s=bestfraction , mode="norm.fraction")>0
table(ypred , data.test$chd)
mean(ypred == data.test$chd)


####################################
## SVM
####################################

library(e1071)

# Train a SVM with default parameters
hr.svm <- svm(chd ~ sbp + tobacco + ldl + famhist + obesity + alcohol + age , data=data.train , type="C-classification")
summary(hr.svm)

# Predict
ypred <- predict(hr.svm , data.test)
mean(ypred == data.test$chd)

# Linear SVM
hr.svm <- svm(chd ~ sbp + tobacco + ldl + famhist + obesity + alcohol + age , data=data.train , type="C-classification" , kernel = "linear" , cost=0.1)
ypred <- predict(hr.svm , data.test)
mean(ypred == data.test$chd)