##--------------------------------------##
## Practical session: initiation to SVM ##
##                                      ##
## Jean-Philippe Vert, 10/11/2010       ##
##--------------------------------------##

# This file is available at
# http://cbio.ensmp.fr/~jvert/svn/tutorials/practical/svmbasic/svmbasic.R
# The corresponding note is available at
# http://cbio.ensmp.fr/~jvert/svn/tutorials/practical/svmbasic/svmbasic_notes.pdf

# Various functions that should be programmed by the students
# Available at http://cbio.ensmp.fr/~jvert/svn/tutorials/practical/svmbasic/general.R
source('general.R')

## Generate data ##
##---------------##

n <- 150            # number of data points
p <- 2              # dimension
sigma <- 2          # standard deviation of the distributions
meanpos <- 0        # centre of the distribution of positive examples
meanneg <- 3        # centre of the distribution of negative examples
npos <- round(n/2)  # number of positive examples
nneg <- n-npos      # number of negative examples

# Generate the positive and negative examples
xpos <- matrix(rnorm(npos*p,mean=meanpos,sd=sigma),npos,p)
xneg <- matrix(rnorm(nneg*p,mean=meanneg,sd=sigma),nneg,p)
x <- rbind(xpos,xneg)

# Generate the labels
y <- matrix(c(rep(1,npos),rep(-1,nneg)))

# Visualize the data
plot(x,col=ifelse(y>0,1,2))
legend("topleft",c('Positive','Negative'),col=seq(2),pch=1,text.col=seq(2))

## Prepare a training and a test set ##
##------------------------------------##

ntrain <- round(n*0.8)      # number of training examples
tindex <- sample(n,ntrain)  # indices of training samples
xtrain <- x[tindex,]
xtest <- x[-tindex,]
ytrain <- y[tindex]
ytest <- y[-tindex]
istrain <- rep(0,n)
istrain[tindex] <- 1

# Visualize
plot(x,col=ifelse(y>0,1,2),pch=ifelse(istrain==1,1,2))
legend("topleft",c('Positive Train','Positive Test','Negative Train','Negative Test'),col=c(1,1,2,2),pch=c(1,2,1,2),text.col=c(1,1,2,2))

## Train a linear SVM with C=1 ##
##-----------------------------##

# load the kernlab package
library(kernlab)

# train the SVM
svp <- ksvm(xtrain,ytrain,type="C-svc",kernel='vanilladot',C=1,scaled=c())

# General summary
svp
# Attributes that you can access
attributes(svp)
# For example, the support vector coefficients, their indices and the offset
alpha(svp)
alphaindex(svp)
b(svp)

# Use the built-in function to pretty-plot the classifier
plot(svp,data=xtrain)

# Visualize the decision boundary with a home-made function (available in general.R)
#pdf('linearsvm.pdf',width=8)
plotlinearsvm2D(svp,xtrain)
#dev.off()

# Predict labels on the test set
ypred <- predict(svp,xtest)
# Confusion table
table(ytest,ypred)
# Compute accuracy
sum(ypred==ytest)/length(ytest)

# Look at the prediction scores
ypredscore <- predict(svp,xtest,type="decision")
# Check that the predicted labels are the signs of the scores
table(ypredscore > 0,ypred)
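# For reference, a possible reconstruction of the plotlinearsvm2D helper used above
# (the actual version is in general.R; the name plotlinearsvm2D.sketch, the sign
# conventions and the plotting details are assumptions). It recovers the primal
# weight vector w from the support vector coefficients and draws the decision
# boundary w'x - b = 0 together with the margins w'x - b = +/-1. It assumes the
# SVM was trained without scaling (scaled=c()), as above.
plotlinearsvm2D.sketch <- function(svp, x) {
  sv <- x[unlist(alphaindex(svp)), , drop=FALSE]  # support vectors
  w <- colSums(coef(svp)[[1]] * sv)               # w = sum_i alpha_i y_i x_i
  b0 <- b(svp)                                    # kernlab's (negative) offset
  f <- predict(svp, x, type="decision")           # decision values, used to colour the points
  plot(x, col=ifelse(f > 0, 1, 2), pch=ifelse(f > 0, 1, 2))
  abline(b0/w[2], -w[1]/w[2])                     # decision boundary
  abline((b0+1)/w[2], -w[1]/w[2], lty=2)          # margin
  abline((b0-1)/w[2], -w[1]/w[2], lty=2)          # margin
}
# Example call: plotlinearsvm2D.sketch(svp, xtrain)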
# Package to compute ROC curves, precision-recall, etc.
library(ROCR)

pred <- prediction(ypredscore,ytest)

# Plot ROC curve
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf)

# Plot precision/recall curve
perf <- performance(pred, measure = "prec", x.measure = "rec")
plot(perf)

# Plot accuracy as a function of the threshold
perf <- performance(pred, measure = "acc")
plot(perf)

## Use cross-validation ##
##----------------------##

# We compute the prediction vector by cross-validation
k <- 5
ypredscorecv <- cvpred.ksvm(x,y,folds=k,type="C-svc",kernel='vanilladot',C=1,scaled=c(),predtype="decision")

# Check the performance
print(table(ypredscorecv > 0,y))
pred <- prediction(ypredscorecv,y)
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf)

# Estimate the CV error with ksvm directly, and compare
svp <- ksvm(x,y,type="C-svc",kernel='vanilladot',C=1,scaled=c(),cross=5)
print(cross(svp))
print(1-sum((ypredscorecv>0)==(y==1))/n)

## Effect of C ##
##-------------##

# Plot the SVM for different values of C
# Observe how the margin changes
clist <- 2^seq(-10,10)
err <- numeric(length(clist))
par(ask=TRUE)
for (i in seq(length(clist))) {
  svp <- ksvm(x,y,type="C-svc",kernel='vanilladot',C=clist[i],scaled=c(),cross=3)
  plotlinearsvm2D(svp,x)
  err[i] <- cross(svp)
}
par(ask=FALSE)

# Plot the CV error as a function of C
plot(clist,err,type='l',log="x",ylim=c(0,1),xlab="C",ylab="Error rate")
grid()

## Nonlinear SVM ##
##---------------##

# Make a toy example
nclust <- 30        # number of points in each cluster
n <- 4*nclust       # number of data points
p <- 2              # dimension
sigma <- 0.8        # standard deviation of the distributions
meanpos <- 0        # centre of the distribution of positive examples
meanneg <- 3        # centre of the distribution of negative examples

# Generate the positive and negative examples (four clusters in an XOR-like layout)
x1 <- rnorm(nclust,mean=meanpos,sd=sigma)
x2 <- rnorm(nclust,mean=meanpos,sd=sigma)
x <- cbind(x1,x2)
x1 <- rnorm(nclust,mean=meanneg,sd=sigma)
x2 <- rnorm(nclust,mean=meanneg,sd=sigma)
x <- rbind(x,cbind(x1,x2))
x1 <- rnorm(nclust,mean=meanpos,sd=sigma)
x2 <- rnorm(nclust,mean=meanneg,sd=sigma)
x <- rbind(x,cbind(x1,x2))
x1 <- rnorm(nclust,mean=meanneg,sd=sigma)
x2 <- rnorm(nclust,mean=meanpos,sd=sigma)
x <- rbind(x,cbind(x1,x2))
y <- c(rep(1,2*nclust),rep(-1,2*nclust))

# Visualize
#pdf('toynonlinear.pdf',width=8)
plot(x,col=ifelse(y>0,1,2),pch=ifelse(y>0,1,2))
legend("topleft",c('Positive','Negative'),col=c(1,2),pch=c(1,2),text.col=c(1,2))
grid()
#dev.off()

# Train linear SVMs for different values of C
clist <- 2^seq(-6,10)
errlin <- numeric(length(clist))
par(ask=TRUE)
for (i in seq(length(clist))) {
  svp <- ksvm(x,y,type="C-svc",kernel='vanilladot',C=clist[i],scaled=c(),cross=3)
  plotlinearsvm2D(svp,x)
  errlin[i] <- cross(svp)
}
par(ask=FALSE)

# Plot the CV error as a function of C
plot(clist,errlin,type='l',log="x",ylim=c(0,1),xlab="C",ylab="Error rate")
grid()

# Train a nonlinear SVM with Gaussian RBF kernel and default parameters
svp <- ksvm(x,y,type="C-svc",kernel='rbf')
#pdf('nonlinearsvm.pdf',width=8)
plot(svp,data=x)
#dev.off()

# Effect of C alone, with automatic choice of sigma
errrbf <- numeric(length(clist))
par(ask=TRUE)
for (i in seq(length(clist))) {
  svp <- ksvm(x,y,type="C-svc",kernel='rbf',C=clist[i],cross=3)
  plot(svp,data=x)
  errrbf[i] <- cross(svp)
}
par(ask=FALSE)

# Plot the CV error as a function of C, for the linear and the Gaussian RBF kernels
plot(clist,errlin,type='l',log="x",ylim=c(0,1),xlab="C",ylab="Error rate",col=1,lwd=2)
lines(clist,errrbf,col=2,lwd=2)
grid()
legend("topright",c('Linear','Gaussian RBF'),lwd=2,col=c(1,2))
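# For reference, a minimal sketch of how a cross-validated prediction helper like
# cvpred.ksvm (provided in general.R, used in the cross-validation section above
# and again on the ALL data below) could be written for decision values. The name
# cvpred.sketch, the argument names and the random fold assignment are assumptions,
# not the actual general.R code; only the ksvm() and predict() calls come from kernlab.
cvpred.sketch <- function(x, y, folds=3, ...) {
  n <- length(y)
  fold <- sample(rep(seq(folds), length.out=n))  # assign each sample to a fold at random
  score <- numeric(n)
  for (k in seq(folds)) {
    test <- which(fold == k)                     # held-out samples for this fold
    model <- ksvm(x[-test, , drop=FALSE], y[-test], ...)
    score[test] <- predict(model, x[test, , drop=FALSE], type="decision")
  }
  score                                          # decision values, in the original sample order
}
# Example call (same arguments as in the cross-validation section above):
# ypredscorecv <- cvpred.sketch(x, y, folds=5, type="C-svc", kernel='vanilladot', C=1, scaled=c())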
# Effect of C and sigma
nc <- length(clist)
sigmalist <- 2^seq(-6,6)
nsigma <- length(sigmalist)
err <- matrix(0,nrow=nc,ncol=nsigma)
for (i in seq(nc)) {
  C <- clist[i]
  cat('C=',C)
  for (j in seq(nsigma)) {
    cat('.')
    svp <- ksvm(x,y,type="C-svc",kernel='rbf',kpar=list(sigma=sigmalist[j]),C=C,cross=3)
    err[i,j] <- cross(svp)
  }
  cat('\n')
}

# Visualize the CV error as a function of C and sigma
library(lattice)
dimnames(err) <- list(C=clist,sigma=sigmalist)
levelplot(err,scales=list(x=list(rot=90)),xlab="C",ylab=expression(sigma),main="Error rate")

## Application on ALL gene expression data ##
##------------------------------------------##

# Load the ALL dataset
library(ALL)
data(ALL)

# Inspect it
?ALL
show(ALL)
print(summary(pData(ALL)))

# Look at the type and stage of each patient
print(ALL$BT)

# Predict the type of disease (B-cell vs T-cell) from the expression data
y <- substr(ALL$BT,1,1)
x <- t(exprs(ALL))
dim(x)
length(y)

# Estimate the performance of a linear SVM by cross-validation
ypred <- cvpred.ksvm(x,y,type="C-svc",kernel='vanilladot',C=1)
print(table(ypred,y))

# Estimate the performance of a nonlinear (Gaussian RBF) SVM by cross-validation
ypred <- cvpred.ksvm(x,y,type="C-svc",kernel='rbf',C=1)
print(table(ypred,y))

# Check the influence of C
clist <- 2^seq(-6,10)
errlin <- numeric(length(clist))
errrbf <- numeric(length(clist))
for (i in seq(length(clist))) {
  cat('.')
  svp <- ksvm(x,y,type="C-svc",kernel='vanilladot',C=clist[i],cross=3)
  errlin[i] <- cross(svp)
  svp <- ksvm(x,y,type="C-svc",kernel='rbf',C=clist[i],cross=3)
  errrbf[i] <- cross(svp)
}

# Plot the CV error as a function of C
plot(clist,errlin,type='l',log="x",ylim=c(0,0.5),xlab="C",ylab="Error rate",col=1,lwd=2,main='ALL dataset')
lines(clist,errrbf,col=2,lwd=2)
grid()
legend("topright",c('Linear','Gaussian RBF'),lwd=2,col=c(1,2))
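# A possible follow-up (a sketch, not part of the original script): summarize the
# B-vs-T discrimination with an ROC curve and AUC computed from cross-validated
# decision values, reusing cvpred.ksvm with predtype="decision" as in the
# cross-validation section above. The +/-1 recoding of the labels and the choice
# of "T" as the positive class are assumptions; if kernlab orients the decision
# values the other way, the curve is simply flipped (AUC below 0.5).
ybin <- ifelse(y == "T", 1, -1)
scorecv <- cvpred.ksvm(x, ybin, folds=5, type="C-svc", kernel='vanilladot', C=1, predtype="decision")
pred <- prediction(scorecv, ybin)
plot(performance(pred, measure="tpr", x.measure="fpr"), main='ALL dataset, linear SVM')
print(performance(pred, measure="auc")@y.values[[1]])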