##-----------------------------------##
## Practical session: string kernels ##
##                                   ##
## Jean-Philippe Vert, 10/11/2010    ##
##-----------------------------------##

# This file is available at
# http://cbio.ensmp.fr/~jvert/svn/tutorials/practical/stringkernels/stringkernels.R
# The corresponding note is available at
# http://cbio.ensmp.fr/~jvert/svn/tutorials/practical/stringkernels/stringkernels_notes.pdf

## Basic string kernels ##
##----------------------##

# We use the kernlab package, where several string kernels are implemented
library(kernlab)

# Look at the descriptions and explanations
help(stringdot)

# Example: a simple spectrum kernel
sk <- stringdot(type="spectrum", length=2, normalized=FALSE)
sk('radar','abracadabra')

# Note the unexpected behavior
sk('aa','aa') # should be 1 but is 2
sk('aa','ba') # should be 0 but is 1
sk('aa','ab') # should be 0 and is 0
# This is because each string contains one additional hidden character at the end!

# Note that all kernels created by stringdot only extract contiguous
# substrings, without gaps. To consider gappy substrings, we use another
# package.
library(stringkernels)

# Example: a simple gappy substring kernel
sgk <- gapweightkernel(length=2,lambda=0.75,normalized=FALSE,use_characters=TRUE)
# Note that contrary to the spectrum kernel above, this one only computes a
# kernel on the characters you see (no hidden end character)
sgk('aa','aa')  # should be 1: OK
# 'aa' occurs in 'aaa' at positions (1,2) and (2,3) with no gap (weight 1
# each), and at (1,3) with one gap (weight lambda=0.75)
sgk('aa','aaa') # should be 1*(1+1+0.75): OK

## Application to the Reuters dataset ##
##------------------------------------##

# Load the data
data(reuters)
y <- rlabels
x <- reuters

# Visualize with kernel PCA for a particular kernel
sk <- stringdot(type="spectrum", length=2, normalized=FALSE)
kpc <- kpca(x,kernel=sk,scale=c())
plot(rotated(kpc),col=ifelse(y==levels(y)[1],1,2))

# Note that it is usually useful to normalize the kernel to remove the effect
# of document length
sk <- stringdot(type="spectrum", length=2, normalized=TRUE)
kpc <- kpca(x,kernel=sk,scale=c())
plot(rotated(kpc),col=ifelse(y==levels(y)[1],1,2))
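# A quick sanity check (not in the original script): normalized=TRUE should
# implement the usual cosine normalization K(x,y)/sqrt(K(x,x)*K(y,y)), which
# we can verify against the unnormalized kernel on a pair of toy strings.
skraw <- stringdot(type="spectrum", length=2, normalized=FALSE)
sknorm <- stringdot(type="spectrum", length=2, normalized=TRUE)
skraw('radar','abracadabra') / sqrt(skraw('radar','radar')*skraw('abracadabra','abracadabra'))
sknorm('radar','abracadabra') # should print the same value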
# The same visualization with the gappy kernel...
sgk <- gapweightkernel(length=2,lambda=0.75,normalized=TRUE,use_characters=TRUE)
kpc <- kpca(x,kernel=sgk,scale=c())
plot(rotated(kpc),col=ifelse(y==levels(y)[1],1,2))

# Train a SVM and estimate the cross-validation error with different kernels.
# First we focus on the spectrum and bounded-range kernels with different
# substring lengths k.
kmax <- 20
errspectrum <- numeric(kmax)
errboundrange <- numeric(kmax)
for (k in seq(kmax)) {
    cat('.')
    sk <- stringdot(type="spectrum", length=k, normalized=TRUE)
    svp <- ksvm(x,y,kernel=sk,scale=c(),cross=5)
    errspectrum[k] <- cross(svp)
    sk <- stringdot(type="boundrange", length=k, normalized=TRUE)
    svp <- ksvm(x,y,kernel=sk,scale=c(),cross=5)
    errboundrange[k] <- cross(svp)
}

# Look at the results
plot(c(1,kmax),c(0,1),type='n',xlab="k",ylab="error")
lines(errspectrum,col=1)
lines(errboundrange,col=2)
grid()
legend("topleft",c('Spectrum','Bounded range'),lty=1,col=seq(2))

# Test the exponential kernel with different values of lambda
lambdaseq <- 1.1^seq(10)
nlambda <- length(lambdaseq)
errexponential <- numeric(nlambda)
for (i in seq(nlambda)) {
    cat('.')
    sk <- stringdot(type="exponential", lambda=lambdaseq[i], normalized=TRUE)
    svp <- ksvm(x,y,kernel=sk,scale=c(),cross=5)
    errexponential[i] <- cross(svp)
}
plot(c(1,max(lambdaseq)),c(0,1),type='n',xlab="lambda",ylab="error")
lines(lambdaseq,errexponential)
grid()

## Protein localization ##
##----------------------##

# We need a library to parse sequence files in FASTA format
library(seqinr)

# Load localization data from http://www.psort.org/dataset/dataset1_0.txt
protdata <- read.fasta("dataset1_0.txt",seqtype="AA",as.string=TRUE)

# Extract protein localization information
annotation <- getName(protdata)
# We get the location information by parsing the annotation: the location is
# the third '|'-separated field of each annotation
extractlocationfromannotation <- function(s){strsplit(s,'|',fixed=TRUE)[[1]][3]}
y <- factor(unlist(lapply(annotation,extractlocationfromannotation)))

# Extract the protein sequences
x <- unlist(getSequence(protdata,as.string=TRUE),recursive=FALSE)

## Discriminate membrane (inner or outer) vs non-membrane proteins ##

# We get the labels as follows
ymembrane <- factor((y=="Inner") | (y=="Outer"))

# Take a random sample of size 100 for training and a disjoint random sample
# of size 100 for testing
ntrain <- 100
ntest <- 100
itrain <- sample(length(x),ntrain)
itest <- sample(seq(length(x))[-itrain],ntest)

# Test the spectrum kernel with various values of k and C
klist <- seq(5)
clist <- 3^seq(-1,6)
err <- matrix(0,length(klist),length(clist))

# Loop over k
for (ik in seq(length(klist))) {
    k <- klist[ik]
    cat('k=',k)
    # Kernel definition
    sk <- stringdot(type="spectrum", length=k, normalized=TRUE)
    # Pre-compute the kernel matrices once per k (this makes the inner loop
    # over C much faster)
    cat('.')
    Ktrain <- kernelMatrix(sk,x[itrain])
    cat('.')
    Ktest <- kernelMatrix(sk,x[itest],x[itrain])
    ytrain <- ymembrane[itrain]
    # Loop over C
    for (ic in seq(length(clist))) {
        cat('.')
        # Train a SVM on the training set
        svp <- ksvm(Ktrain,ytrain,type="C-svc",C=clist[ic])
        # Predict on the test set: with a precomputed kernel, predict()
        # expects the test-vs-support-vector block of the kernel matrix
        ypred <- predict(svp,as.kernelMatrix(Ktest[,SVindex(svp)]))
        # Store the number of misclassified test points
        err[ik,ic] <- sum(ypred != ymembrane[itest])
    }
    cat('\n')
}
# Convert error counts into error rates
err <- err/ntest

plot(c(min(clist),max(clist)),c(0,0.5),log="x",type='n',
     xlab="C",ylab="Error",main="Spectrum kernel")
nk <- length(klist)
for (ik in seq(nk)) {
    lines(clist,err[ik,],col=ik,lwd=2)
}
legend("topright",paste("k =",seq(nk)),col=seq(nk),lwd=rep(2,nk))
grid(col='darkgray')
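# A small convenience snippet (not in the original script): locate the
# (k, C) pair with the lowest test error in the matrix we just filled.
best <- which(err == min(err), arr.ind=TRUE)[1,]
cat("best k =", klist[best["row"]], " best C =", clist[best["col"]],
    " test error =", min(err), "\n")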
## Example of multiclass classification ##

sk <- stringdot(type="spectrum", length=2, normalized=TRUE)
Ktrain <- kernelMatrix(sk,x[itrain])
# Recompute the test kernel matrix too: Ktest still holds the values of the
# last kernel used in the loop above (k=5), not this length-2 kernel
Ktest <- kernelMatrix(sk,x[itest],x[itrain])
ytrain <- y[itrain]
m <- ksvm(Ktrain,ytrain,type="C-svc")
ypred <- predict(m,as.kernelMatrix(Ktest[,SVindex(m)]))
table(ypred,y[itest])
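# A small follow-up sketch (not in the original script): overall accuracy on
# the test set, read off the diagonal of the confusion matrix. Since ypred
# inherits the factor levels of y, the table is square and its diagonal
# counts the correctly classified test proteins.
conf <- table(ypred,y[itest])
sum(diag(conf))/sum(conf)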