# Practical session: Machine learning for computational biology # 2012 # Function to generate a (n, p) matrix of draws from a random variable RandomMatrix <- function( dist, n, p, ... ) { rs <- dist( n*p, ... ) matrix( rs, n, p ) } ## 1 Linear SVM # Function to generate a linearly separable dataset GenerateDatasetLinear <- function( n, p ) { n.pos <- floor( n/2 ) x.pos <- RandomMatrix( rnorm, n.pos, p, mean=0, sd=1 ) x.neg <- RandomMatrix( rnorm, n-n.pos, p, mean=3, sd=1 ) y <- c( rep( 1, n.pos ), rep( -1, n-n.pos ) ) n.train <- floor( 0.8 * n ) idx.train <- sample( n, n.train ) is.train <- rep( 0, n ) is.train[idx.train] <- 1 data.frame( x=rbind( x.pos, x.neg ), y=y, train=is.train ) } # Generate the dataset (150 points in dimension 2) linear.data <- GenerateDatasetLinear( 150, 2 ) # Extract train and test subsets of the dataset linear.train <- linear.data[linear.data$train==1, ] linear.train <- subset( linear.train, select=-train ) linear.test <- linear.data[linear.data$train==0, ] linear.test <- subset( linear.test, select=-train ) # Plot the data require( 'ggplot2' ) qplot( data=linear.data, x.1, x.2, colour=factor(y), shape=factor(train) ) # Train a linear SVM require( 'kernlab' ) linear.svm <- ksvm( y ~ ., data=linear.train, type='C-svc', kernel='vanilladot', C=100, scale=c() ) # Plot the model plot( linear.svm, data=linear.train ) # Predictions for test set linear.prediction <- predict( linear.svm, linear.test ) # Prediction scores linear.prediction.score <- predict( linear.svm, linear.test, type='decision' ) # Compute ROC and Precision-Recall curves require( 'ROCR') linear.roc.curve <- performance( prediction( linear.prediction.score, linear.test$y ), measure='tpr', x.measure='fpr' ) plot( linear.roc.curve ) linear.pr.curve <- performance( prediction( linear.prediction.score, linear.test$y ), measure='prec', x.measure='rec' ) plot( linear.pr.curve ) # Generate a 'cross-shaped' dataset (not linearly separable) GenerateDatasetNonlinear <- function( n, p ) { bottom.left <- RandomMatrix( rnorm, n, p, mean=0, sd=1 ) upper.right <- RandomMatrix( rnorm, n, p, mean=4, sd=1 ) tmp1 <- RandomMatrix( rnorm, n, p, mean=0, sd=1 ) tmp2 <- RandomMatrix( rnorm, n, p, mean=4, sd=1 ) upper.left <- cbind( tmp1[,1], tmp2[,2] ) bottom.right <- cbind( tmp2[,1], tmp1[,2] ) y <- c( rep( 1, 2 * n ), rep( -1, 2 * n ) ) idx.train <- sample( 4 * n, floor( 3.5 * n ) ) is.train <- rep( 0, 4 * n ) is.train[idx.train] <- 1 data.frame( x=rbind( bottom.left, upper.right, upper.left, bottom.right ), y=y, train=is.train ) } nonlinear.data <- GenerateDatasetNonlinear( 100, 2 ) nonlinear.train <- nonlinear.data[nonlinear.data$train==1, ] nonlinear.train <- subset( nonlinear.train, select=-train ) nonlinear.test <- nonlinear.data[nonlinear.data$train==0, ] nonlinear.test <- subset( nonlinear.test, select=-train ) ## Nonlinear SVM # Train a nonlinear SVM nonlinear.svm <- ksvm( y ~ ., data=nonlinear.train, type='C-svc', kernel='rbf', kpar=list(sigma=1), C=100, scale() ) plot( nonlinear.svm, data=nonlinear.train ) ## 3 Application: gene expression data analysis require( 'ALL' ) data( 'ALL' ) all.data <- data.frame( x=t( exprs(ALL) ), y=substr( ALL$BT, 1, 1 ) ) )