##----------------------------------------------------------##
## Practical session: reconstruction of regulatory networks ##
##                                                          ##
## Jean-Philippe Vert, 10/11/2010                           ##
##----------------------------------------------------------##

# This file is available at
# http://cbio.ensmp.fr/~jvert/svn/tutorials/practical/reginference/reginference.R
# The corresponding note is available at
# http://cbio.ensmp.fr/~jvert/svn/tutorials/practical/reginference/reginference_notes.pdf

## Load data ##
##-----------##

# Download the data from
# http://cbio.ensmp.fr/~jvert/svn/tutorials/practical/reginference/data.tar.gz
# then tar xvzf data.tar.gz

# Load expression data (genes in rows, experimental conditions in columns).
# NOTE(review): this shadows base::exp(); kept as-is because the rest of the
# script refers to `exp`.
exp <- as.matrix(read.table('expression.txt'))

# Normalize each gene expression profile to mean 0 and standard deviation 1.
# scale() standardizes columns, hence the double transpose to act on rows.
exp <- t(scale(t(exp)))

# Load regulation data: column 1 = transcription factor, column 2 = target gene
reg <- read.table('regulation.txt')

# Load gene names (lookup table from gene index to gene name)
gname <- as.matrix(read.table('gene_names.txt'))

# List of tf and targets appearing in the known regulations
tf <- sort(unique(reg[,1]))
targets <- sort(unique(reg[,2]))

## For this practical session, we reduce the number of targets to avoid long
## computation time. In general this is not required.
##
# Take only ntargets (here 500) targets randomly among all targets
ntargets <- 500
targets <- targets[sample(length(targets), ntargets)]

# Restrict regulations to those targets
reg <- reg[!is.na(match(reg[, 2], targets)), ]
nreg <- dim(reg)[1]

# Restrict tf to those targeting those targets
tf <- sort(unique(reg[, 1]))
ntf <- length(tf)

# Detect tf in targets
tfintargets <- intersect(tf, targets)
# Linear (column-major) indices of the possible self-regulations (target = tf)
# in the ntf*ntargets score matrices
selfregindex <- match(tfintargets, tf) + ntf * (match(tfintargets, targets) - 1)

cat('After reduction: ', ntf, 'transcription factors target', ntargets,
    'genes through', nreg, 'regulations\n')

# Prepare network files for visualization in Cytoscape
netgenes <- sort(unique(c(tf, targets)))
write.table(reg, file = "myedges.txt", row.names = FALSE, col.names = FALSE)
write.table(cbind(netgenes, gname[netgenes]), "mygenenames.txt",
            quote = FALSE, row.names = FALSE, col.names = FALSE)
# You can now look at your network in Cytoscape by importing these files

## Assessing the performance ##
##---------------------------##

library(ROCR)

# We make the ntf*ntargets binary matrix of known regulations (1 = known edge)
truereg <- matrix(0, ntf, ntargets)
for (i in seq_len(nreg)) {
  truereg[match(reg[i, 1], tf), match(reg[i, 2], targets)] <- 1
}

# Total number of (tf, target) pairs in the score matrices
allind <- ntf * ntargets

# Assess the performance of a similarity matrix.
#   simscore: a ntf*ntargets matrix of scores
#   exclude:  linear indices (between 1 and ntf*ntargets) to exclude from
#             the analysis, e.g. self-regulations or training edges
# Returns a list with ROCR performance objects:
#   roc: TPR vs FPR curve; pre: precision vs recall curve
assessperf <- function(simscore, exclude = NULL) {
  ind <- setdiff(seq_len(allind), exclude)
  pred <- prediction(simscore[ind], truereg[ind])
  roc <- performance(pred, measure = "tpr", x.measure = "fpr")
  pre <- performance(pred, measure = "prec", x.measure = "rec")
  return(list(roc = roc, pre = pre))
}

# For example, performance of random prediction
score_rand <- matrix(rnorm(ntf * ntargets), ntf, ntargets)
perf_rand <- assessperf(score_rand)

# Plot ROC and precision-recall curves
plot(perf_rand$roc, main = 'Random prediction')
grid(col = 'darkgray')
plot(perf_rand$pre, ylim = c(0, 1), main = 'Random prediction')
grid(col = 'darkgray')

## De novo predictions with similarity-based methods ##
##---------------------------------------------------##

library(bioDist)

## Euclidean distance based similarity ##
# Score = 1 - distance, so larger scores mean more similar profiles
simEuc <- matrix(0, ntf, ntargets)
for (itf in seq_len(ntf)) {
  cat('.')
  for (itarget in seq_len(ntargets)) {
    simEuc[itf, itarget] <- 1 - euc(exp[c(targets[itarget], tf[itf]), ])
  }
}
# Assess performance (excluding self-regulation)
perf_Euc <- assessperf(simEuc, selfregindex)

# Plot performance against random
pdf(file = "roc_euc_ecoli.pdf")
plot(perf_rand$roc@x.values[[1]], perf_rand$roc@y.values[[1]], type = 'l',
     col = 1, xlab = 'FPR', ylab = 'TPR', lwd = 2)
lines(perf_Euc$roc@x.values[[1]], perf_Euc$roc@y.values[[1]], type = 'l',
      col = 2, lwd = 2)
legend("topleft", c('Random', 'Euclidean'), col = seq(2), lwd = 2)
# (a second grid(col='darkgray') call here was an accidental duplicate)
grid(col = 'black')
dev.off()

pdf(file = "pre_euc_ecoli.pdf")
plot(perf_rand$pre@x.values[[1]], perf_rand$pre@y.values[[1]], type = 'l',
     col = 1, xlab = 'Recall', ylab = 'Precision', xlim = c(0, 0.14),
     ylim = c(0, 1), lwd = 2)
lines(perf_Euc$pre@x.values[[1]], perf_Euc$pre@y.values[[1]], type = 'l',
      col = 2, lwd = 2)
grid(col = 'black')
legend("topright", c('Random', 'Euclidean'), col = seq(2), lwd = 2)
dev.off()

## Pearson-correlation based similarity ##
simPearson <- matrix(0, ntf, ntargets)
for (itf in seq_len(ntf)) {
  cat('.')
  for (itarget in seq_len(ntargets)) {
    simPearson[itf, itarget] <- 1 - cor.dist(exp[c(targets[itarget], tf[itf]), ])
  }
}
# Assess performance (excluding self-regulation)
perf_Pearson <- assessperf(simPearson, selfregindex)

## Spearman-correlation based similarity ##
simSpearman <- matrix(0, ntf, ntargets)
for (itf in seq_len(ntf)) {
  cat('.')
  for (itarget in seq_len(ntargets)) {
    simSpearman[itf, itarget] <- 1 - spearman.dist(exp[c(targets[itarget], tf[itf]), ])
  }
}
# Assess performance (excluding self-regulation)
perf_Spearman <- assessperf(simSpearman, selfregindex)

## Mutual-information based similarity ##
simMI <- matrix(0, ntf, ntargets)
for (itf in seq_len(ntf)) {
  cat('.')
  for (itarget in seq_len(ntargets)) {
    simMI[itf, itarget] <- 1 - MIdist(exp[c(targets[itarget], tf[itf]), ])
  }
}
# Assess performance (excluding self-regulation)
perf_MI <- assessperf(simMI, selfregindex)

# Plot all ROC curves
pdf(file = "roc_cor_ecoli.pdf")
plot(perf_rand$roc@x.values[[1]], perf_rand$roc@y.values[[1]], type = 'l',
     col = 1, xlab = 'FPR', ylab = 'TPR', lwd = 2)
lines(perf_Euc$roc@x.values[[1]], perf_Euc$roc@y.values[[1]], type = 'l',
      col = 2, lwd = 2)
lines(perf_Pearson$roc@x.values[[1]], perf_Pearson$roc@y.values[[1]],
      type = 'l', col = 3, lwd = 2)
lines(perf_Spearman$roc@x.values[[1]], perf_Spearman$roc@y.values[[1]],
      type = 'l', col = 4, lwd = 2)
lines(perf_MI$roc@x.values[[1]], perf_MI$roc@y.values[[1]], type = 'l',
      col = 5, lwd = 2)
# fixed typo: 'Randome' -> 'Random'
legend("topleft", c('Random', 'Euclidean', 'Pearson', 'Spearman', 'MI'),
       col = seq(5), lwd = 2)
grid(col = 'black')
dev.off()

# Plot all Precision-Recall curves
pdf(file = "pre_cor_ecoli.pdf")
plot(perf_rand$pre@x.values[[1]], perf_rand$pre@y.values[[1]], type = 'l',
     col = 1, xlab = 'Recall', ylab = 'Precision', xlim = c(0, 0.14),
     ylim = c(0, 1), lwd = 2)
lines(perf_Euc$pre@x.values[[1]], perf_Euc$pre@y.values[[1]], type = 'l',
      col = 2, lwd = 2)
lines(perf_Pearson$pre@x.values[[1]], perf_Pearson$pre@y.values[[1]],
      type = 'l', col = 3, lwd = 2)
lines(perf_Spearman$pre@x.values[[1]], perf_Spearman$pre@y.values[[1]],
      type = 'l', col = 4, lwd = 2)
lines(perf_MI$pre@x.values[[1]], perf_MI$pre@y.values[[1]], type = 'l',
      col = 5, lwd = 2)
legend("topright", c('Random', 'Euclidean', 'Pearson', 'Spearman', 'MI'),
       col = seq(5), lwd = 2)
grid(col = 'black')
dev.off()

## De novo inference with feature selection methods ##
##--------------------------------------------------##

## With LARS + stability selection
source('stabilityselection.R')
score_lars <- matrix(0, ntf, ntargets)
nbootstrap <- 500
nstepsLARS <- 5
for (itarget in seq_len(ntargets)) {
  cat('.')
  curtarget <- targets[itarget]
  # TF to be used for prediction: all TF except the target itself when the
  # target is a TF (avoids trivial self-regulation)
  predictorTFid <- tf != curtarget
  # Extract the variables for regression: y = target profile, x = TF profiles
  y <- exp[curtarget, ]
  x <- exp[tf[predictorTFid], ]
  # Stability selection scores for each candidate TF
  r <- stabilityselection(t(x), y, nbootstrap = nbootstrap,
                          nsteps = nstepsLARS, plotme = FALSE)
  score_lars[predictorTFid, itarget] <- r
}
perf_lars <- assessperf(score_lars, selfregindex)

## With random forests
library(randomForest)
score_rf <- matrix(0, ntf, ntargets)
for (itarget in seq_len(ntargets)) {
  cat('.')
  curtarget <- targets[itarget]
  # TF to be used for prediction (all TF except the target if it is a TF)
  predictorTFid <- tf != curtarget
  # Extract the variables for regression
  y <- exp[curtarget, ]
  x <- exp[tf[predictorTFid], ]
  # Random forest regression; variable importance scores the candidate TFs
  # (original comment wrongly said "stability selection" here)
  r <- randomForest(t(x), y)
  score_rf[predictorTFid, itarget] <- r$importance
}
perf_rf <- assessperf(score_rf, selfregindex)

# Plot all ROC curves
pdf(file = "roc_ecoli.pdf")
plot(perf_rand$roc@x.values[[1]], perf_rand$roc@y.values[[1]], type = 'l',
     col = 1, xlab = 'FPR', ylab = 'TPR', lwd = 2)
lines(perf_Euc$roc@x.values[[1]], perf_Euc$roc@y.values[[1]], type = 'l',
      col = 2, lwd = 2)
lines(perf_Pearson$roc@x.values[[1]], perf_Pearson$roc@y.values[[1]],
      type = 'l', col = 3, lwd = 2)
lines(perf_Spearman$roc@x.values[[1]], perf_Spearman$roc@y.values[[1]],
      type = 'l', col = 4, lwd = 2)
lines(perf_MI$roc@x.values[[1]], perf_MI$roc@y.values[[1]], type = 'l',
      col = 5, lwd = 2)
lines(perf_lars$roc@x.values[[1]], perf_lars$roc@y.values[[1]], type = 'l',
      col = 6, lwd = 2)
lines(perf_rf$roc@x.values[[1]], perf_rf$roc@y.values[[1]], type = 'l',
      col = 7, lwd = 2)
# fixed typo: 'Randome' -> 'Random'
legend("topleft",
       c('Random', 'Euclidean', 'Pearson', 'Spearman', 'MI', 'Lasso', 'RF'),
       col = seq(7), lwd = 2)
grid(col = 'black')
dev.off()

# Plot all Precision-Recall curves
# (device stays open here: the remaining curves, legend and dev.off() follow)
pdf(file = "pre_ecoli.pdf")
plot(perf_rand$pre@x.values[[1]], perf_rand$pre@y.values[[1]], type = 'l',
     col = 1, xlab = 'Recall', ylab = 'Precision', xlim = c(0, 0.14),
     ylim = c(0, 1), lwd = 2)
lines(perf_Euc$pre@x.values[[1]], perf_Euc$pre@y.values[[1]], type = 'l',
      col = 2, lwd = 2)
lines(perf_Pearson$pre@x.values[[1]], perf_Pearson$pre@y.values[[1]],
      type = 'l', col = 3, lwd = 2)
lines(perf_Spearman$pre@x.values[[1]], perf_Spearman$pre@y.values[[1]],
      type = 'l', col = 4, lwd = 2)
lines(perf_MI$pre@x.values[[1]], perf_MI$pre@y.values[[1]], type = 'l',
      col = 5, lwd = 2)
lines(perf_lars$pre@x.values[[1]], perf_lars$pre@y.values[[1]], type = 'l',
      col = 6, lwd = 2)
lines(perf_rf$pre@x.values[[1]], perf_rf$pre@y.values[[1]], type = 'l',
      col = 7, lwd = 2)
legend("topright",
       c('Random', 'Euclidean', 'Pearson', 'Spearman', 'MI', 'Lasso', 'RF'),
       col = seq(7), lwd = 2)
grid(col = 'black')
dev.off()

# Save results
save(perf_rand, perf_Euc, perf_Pearson, perf_Spearman, perf_MI, perf_lars,
     perf_rf, file = "perf.Rdata")

# Extract the predictions at 50% precision: take the smallest score threshold
# for which the Pearson precision-recall curve still reaches > 0.5 precision
threshold <- perf_Pearson$pre@alpha.values[[1]][
  max(which(perf_Pearson$pre@y.values[[1]] > 0.5))]
# Zero out self-regulations so they are never predicted
# (local variable renamed from misspelled 'simPersonwithoutselfreg')
simPearsonwithoutselfreg <- simPearson
simPearsonwithoutselfreg[selfregindex] <- 0
predpairs <- which(simPearsonwithoutselfreg >= threshold, arr.ind = TRUE)
predreg <- cbind(tf[predpairs[, 1]], targets[predpairs[, 2]])
write.table(predreg, file = "myprededges.txt", row.names = FALSE,
            col.names = FALSE)
# We can now look at them in Cytoscape!

## Supervised inference ##
##----------------------##

## Data preparation ##
# We assume that 80% of the edges are known, the rest are unknown
# (the original comment said "half", but the code samples nreg*0.8)
trainregindex <- sample(nreg, nreg * 0.8)
trainreglist <- reg[trainregindex, ]
# Column-major linear indices of the training edges in the ntf*ntargets matrix
trainregindexinmatrix <- match(trainreglist[, 1], tf) +
  ntf * (match(trainreglist[, 2], targets) - 1)
# Make a ntf*ntargets matrix of binary labels from the training regulations:
# +1 for known regulations, -1 everywhere else
trainreg <- matrix(-1, ntf, ntargets)
trainreg[trainregindexinmatrix] <- 1

## SIRENE scores (using a SVM) ##
## Note that for sake of simplicity we implement below a very simple method:
## train a SVM to discriminate known regulated genes from unknown ones, and use
## the training SVM score on the unknown ones to predict missing regulations.
## Better methods for learning from positive and unlabeled examples are possible.
# library() errors immediately if kernlab is missing (require() only warns)
library(kernlab)

# By default we give a very bad score (e.g., if no training gene is available
# for a tf, we do not even train a SVM)
score_sirene <- matrix(-10, ntf, ntargets)

# Extract the expression matrix for all target genes
x <- exp[targets, ]

# Consider each TF in turn
for (itf in seq_len(ntf)) {
  cat('.')
  # Only train when this TF has at least one positive (known regulated) target
  if (max(trainreg[itf, ]) > 0) {
    # Train a SVM to discriminate regulated vs non-regulated targets
    m <- ksvm(x, trainreg[itf, ], type = 'C-svc', kernel = "rbf", C = 1)
    # The score is the training SVM decision value
    score_sirene[itf, ] <- predict(m, x, type = 'decision')
  }
}

# Assess performance (excluding self-regulation and known regulations)
exclude <- union(trainregindexinmatrix, selfregindex)
perf_SIRENE <- assessperf(score_sirene, exclude)
perf_Pearson <- assessperf(simPearson, exclude)

# Plot ROC curves
plot(perf_Pearson$roc@x.values[[1]], perf_Pearson$roc@y.values[[1]],
     type = 'l', col = 1, xlab = 'FPR', ylab = 'TPR', lwd = 2)
lines(perf_SIRENE$roc@x.values[[1]], perf_SIRENE$roc@y.values[[1]],
      type = 'l', col = 2, lwd = 2)
legend("topright", c('Pearson', 'SIRENE'), col = seq(2), lwd = 2)
grid()

# Plot precision-recall curves (original comment wrongly said "Plot AUC")
plot(perf_Pearson$pre@x.values[[1]], perf_Pearson$pre@y.values[[1]],
     type = 'l', col = 1, xlab = 'Recall', ylab = 'Precision', lwd = 2,
     ylim = c(0, 1))
lines(perf_SIRENE$pre@x.values[[1]], perf_SIRENE$pre@y.values[[1]],
      type = 'l', col = 2, lwd = 2)
legend("topright", c('Pearson', 'SIRENE'), col = seq(2), lwd = 2)
grid()