# Practical session: Machine learning for computational biology
# 2013

# Package installation
pkgs <- c( 'ggplot2', 'kernlab', 'ROCR' )
install.packages( pkgs )
source( 'http://bioconductor.org/biocLite.R' )
biocLite( 'ALL' )

# Function to generate an (n, p) matrix of draws from a random variable
RandomMatrix <- function( dist, n, p, ... ) {
  rs <- dist( n*p, ... )
  matrix( rs, n, p )
}

## 1 Linear SVM

# Function to generate a linearly separable dataset:
# two Gaussian clouds (means 0 and 3), with an 80/20 train/test split
GenerateDatasetLinear <- function( n, p ) {
  n.pos <- floor( n/2 )
  x.pos <- RandomMatrix( rnorm, n.pos, p, mean=0, sd=1 )
  x.neg <- RandomMatrix( rnorm, n-n.pos, p, mean=3, sd=1 )
  y <- c( rep( 1, n.pos ), rep( -1, n-n.pos ) )
  n.train <- floor( 0.8 * n )
  idx.train <- sample( n, n.train )
  is.train <- rep( 0, n )
  is.train[idx.train] <- 1
  data.frame( x=rbind( x.pos, x.neg ), y=y, train=is.train )
}

# Generate the dataset (150 points in dimension 2)
linear.data <- GenerateDatasetLinear( 150, 2 )

# Extract train and test subsets of the dataset
linear.train <- linear.data[linear.data$train==1, ]
linear.train <- subset( linear.train, select=-train )
linear.test <- linear.data[linear.data$train==0, ]
linear.test <- subset( linear.test, select=-train )

# Plot the data
require( 'ggplot2' )
qplot( data=linear.data, x.1, x.2, colour=factor(y), shape=factor(train) )

# Train a linear SVM ('scaled=c()' disables feature scaling)
require( 'kernlab' )
linear.svm <- ksvm( y ~ ., data=linear.train, type='C-svc',
                    kernel='vanilladot', C=100, scaled=c() )

# Plot the model
plot( linear.svm, data=linear.train )

# Predictions for the test set
linear.prediction <- predict( linear.svm, linear.test )

# Prediction scores (signed distances to the separating hyperplane)
linear.prediction.score <- predict( linear.svm, linear.test, type='decision' )

# Compute ROC and precision-recall curves
require( 'ROCR' )
linear.roc.curve <- performance( prediction( linear.prediction.score, linear.test$y ),
                                 measure='tpr', x.measure='fpr' )
plot( linear.roc.curve )
linear.pr.curve <- performance( prediction( linear.prediction.score, linear.test$y ),
                                measure='prec', x.measure='rec' )
plot( linear.pr.curve )

# Plot the model, interactively adjusting C
# (the 'manipulate' package ships with RStudio)
require( 'manipulate' )
manipulate( plot( ksvm( y ~ ., data=linear.train, type='C-svc',
                        kernel='vanilladot', C=2^c.exponent, scaled=c() ),
                  data=linear.train ),
            c.exponent=slider(-10, 10) )

# Compute decision scores by n-fold cross-validation:
# each fold is held out in turn and scored by a model trained on the rest
CrossValidation <- function( dataset, n.folds=10, ... ) {
  n <- nrow( dataset )
  idx <- split( sample( seq(n) ), seq(n.folds) )
  scores <- rep( 0, n )
  for( i in seq( n.folds ) ) {
    model <- ksvm( y ~ ., data=dataset[ -idx[[i]], ], ... )
    scores[ idx[[i]] ] <- predict( model, dataset[ idx[[i]], ], type='decision' )
  }
  scores
}

# Cross-validated ROC curve for the linear SVM
# (drop the 'train' indicator so it is not used as a feature)
linear.crossval <- CrossValidation( subset( linear.data, select=-train ),
                                    n.folds=10, type='C-svc',
                                    kernel='vanilladot', C=100 )
plot( performance( prediction( linear.crossval, linear.data$y ),
                   measure='tpr', x.measure='fpr' ) )
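# The cross-validated ROC curve can also be summarised by its AUC, using
# ROCR's 'auc' measure on the same prediction object. A minimal sketch;
# 'linear.crossval.auc' is an illustrative name, not used elsewhere here:
linear.crossval.auc <- performance( prediction( linear.crossval, linear.data$y ),
                                    measure='auc' )
unlist( linear.crossval.auc@y.values )    # area under the cross-validated ROC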
## 2 Nonlinear SVM

# Generate a 'cross-shaped' dataset (not linearly separable):
# four Gaussian clouds, with opposite corners sharing a label
GenerateDatasetNonlinear <- function( n, p ) {
  bottom.left <- RandomMatrix( rnorm, n, p, mean=0, sd=1 )
  upper.right <- RandomMatrix( rnorm, n, p, mean=4, sd=1 )
  tmp1 <- RandomMatrix( rnorm, n, p, mean=0, sd=1 )
  tmp2 <- RandomMatrix( rnorm, n, p, mean=4, sd=1 )
  upper.left <- cbind( tmp1[,1], tmp2[,2] )
  bottom.right <- cbind( tmp2[,1], tmp1[,2] )
  y <- c( rep( 1, 2 * n ), rep( -1, 2 * n ) )
  idx.train <- sample( 4 * n, floor( 3.5 * n ) )
  is.train <- rep( 0, 4 * n )
  is.train[idx.train] <- 1
  data.frame( x=rbind( bottom.left, upper.right, upper.left, bottom.right ),
              y=y, train=is.train )
}

# Generate the dataset, then extract train and test subsets
nonlinear.data <- GenerateDatasetNonlinear( 100, 2 )
nonlinear.train <- nonlinear.data[nonlinear.data$train==1, ]
nonlinear.train <- subset( nonlinear.train, select=-train )
nonlinear.test <- nonlinear.data[nonlinear.data$train==0, ]
nonlinear.test <- subset( nonlinear.test, select=-train )

# Train a nonlinear SVM with a Gaussian (RBF) kernel
nonlinear.svm <- ksvm( y ~ ., data=nonlinear.train, type='C-svc',
                       kernel='rbfdot', kpar=list(sigma=1), C=100, scaled=c() )
plot( nonlinear.svm, data=nonlinear.train )

# Plot the model, interactively adjusting C and the kernel
manipulate( plot( ksvm( y ~ ., data=nonlinear.train, type='C-svc',
                        kernel=k, C=2^c.exponent, scaled=c() ),
                  data=nonlinear.train ),
            c.exponent=slider(-10, 10),
            k=picker('Gaussian'='rbfdot', 'Linear'='vanilladot',
                     'Hyperbolic'='tanhdot', 'Spline'='splinedot',
                     'Laplacian'='laplacedot') )

# Compute the error by cross-validation for several values of C
# ('cross' is kernlab's accessor for the built-in cross-validation error;
# the fold count is named 'n.folds' to avoid shadowing that accessor)
BiasVarianceTradeoff <- function( dataset, n.folds=10, c.seq=2^seq(-10, 10), ... ) {
  err <- sapply( c.seq, function( c ) {
    cross( ksvm( y ~ ., data=dataset, C=c, cross=n.folds, ... ) )
  })
  data.frame( c=c.seq, err )
}

# Plot the "error versus C" curve
qplot( c, err,
       data=BiasVarianceTradeoff( subset( nonlinear.data, select=-train ),
                                  type='C-svc', kernel='rbfdot' ),
       geom='line', log='x' )

## 3 Application: gene expression data analysis

# The ALL dataset: microarray expression profiles of acute lymphoblastic
# leukemia patients; the first letter of ALL$BT gives the B-cell/T-cell label
require( 'ALL' )
data( 'ALL' )
all.data <- data.frame( x=t( exprs(ALL) ), y=substr( ALL$BT, 1, 1 ) )
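# As a first experiment on these data, one might estimate the error of a
# linear SVM with kernlab's built-in cross-validation, as in section 2.
# A minimal sketch; 'all.svm' and the choices C=100 and cross=5 are
# illustrative, not prescribed by the session:
all.svm <- ksvm( y ~ ., data=all.data, type='C-svc',
                 kernel='vanilladot', C=100, cross=5 )
cross( all.svm )    # 5-fold cross-validated error estimate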