##----------------------------------------------------------##
## Practical session: reconstruction of regulatory networks ##
##                                                          ##
## Jean-Philippe Vert, 10/11/2010                           ##
##----------------------------------------------------------##

# This file is available at
# http://cbio.ensmp.fr/~jvert/svn/tutorials/practical/reginference/reginference.R
# The corresponding note is available at
# http://cbio.ensmp.fr/~jvert/svn/tutorials/practical/reginference/reginference_notes.pdf

## Load data ##
##-----------##

# Download the data from
# http://cbio.ensmp.fr/~jvert/svn/tutorials/practical/reginference/data.tar.gz
# then tar xvzf data.tar.gz

# Load expression data (genes in rows, experimental conditions in columns).
# NOTE(review): this shadows base::exp(); kept as-is because the rest of the
# script refers to `exp`.
exp <- as.matrix(read.table('expression.txt'))

# Normalize each gene expression profile to mean 0 and standard deviation 1.
# scale() standardizes columns, hence the double transpose to act on rows.
exp <- t(scale(t(exp)))

# Load regulation data: column 1 = transcription factor, column 2 = target gene
reg <- read.table('regulation.txt')

# Load gene names (lookup table from gene index to gene name)
gname <- as.matrix(read.table('gene_names.txt'))

# List of tf and targets appearing in the known regulations
tf <- sort(unique(reg[,1]))
targets <- sort(unique(reg[,2]))

## For this practical session, we reduce the number of targets to avoid long
## computation time. In general this is not required.
##
# Take only ntargets (here 500) targets randomly among all targets
ntargets <- 500
targets <- targets[sample(length(targets), ntargets)]

# Restrict regulations to those targets
reg <- reg[!is.na(match(reg[, 2], targets)), ]
nreg <- dim(reg)[1]

# Restrict tf to those targeting those targets
tf <- sort(unique(reg[, 1]))
ntf <- length(tf)

# Detect tf in targets
tfintargets <- intersect(tf, targets)
# Linear (column-major) indices of the possible self-regulations (target = tf)
# in the ntf*ntargets score matrices
selfregindex <- match(tfintargets, tf) + ntf * (match(tfintargets, targets) - 1)

cat('After reduction: ', ntf, 'transcription factors target', ntargets,
    'genes through', nreg, 'regulations\n')

# Prepare network files for visualization in Cytoscape
netgenes <- sort(unique(c(tf, targets)))
write.table(reg, file = "myedges.txt", row.names = FALSE, col.names = FALSE)
write.table(cbind(netgenes, gname[netgenes]), "mygenenames.txt",
            quote = FALSE, row.names = FALSE, col.names = FALSE)
# You can now look at your network in Cytoscape by importing these files

## Assessing the performance ##
##---------------------------##

library(ROCR)

# We make the ntf*ntargets binary matrix of known regulations (1 = known edge)
truereg <- matrix(0, ntf, ntargets)
for (i in seq_len(nreg)) {
  truereg[match(reg[i, 1], tf), match(reg[i, 2], targets)] <- 1
}

# Total number of (tf, target) pairs in the score matrices
allind <- ntf * ntargets

# Assess the performance of a similarity matrix.
#   simscore: a ntf*ntargets matrix of scores
#   exclude:  linear indices (between 1 and ntf*ntargets) to exclude from
#             the analysis, e.g. self-regulations or training edges
# Returns a list with ROCR performance objects:
#   roc: TPR vs FPR curve; pre: precision vs recall curve
assessperf <- function(simscore, exclude = NULL) {
  ind <- setdiff(seq_len(allind), exclude)
  pred <- prediction(simscore[ind], truereg[ind])
  roc <- performance(pred, measure = "tpr", x.measure = "fpr")
  pre <- performance(pred, measure = "prec", x.measure = "rec")
  return(list(roc = roc, pre = pre))
}

# For example, performance of random prediction
score_rand <- matrix(rnorm(ntf * ntargets), ntf, ntargets)
perf_rand <- assessperf(score_rand)

# Plot ROC and precision-recall curves
plot(perf_rand$roc, main = 'Random prediction')
grid(col = 'darkgray')
plot(perf_rand$pre, ylim = c(0, 1), main = 'Random prediction')
grid(col = 'darkgray')

## De novo predictions with similarity-based methods ##
##---------------------------------------------------##

library(bioDist)

## Euclidean distance based similarity ##
# Score = 1 - distance, so larger scores mean more similar profiles
simEuc <- matrix(0, ntf, ntargets)
for (itf in seq_len(ntf)) {
  cat('.')
  for (itarget in seq_len(ntargets)) {
    simEuc[itf, itarget] <- 1 - euc(exp[c(targets[itarget], tf[itf]), ])
  }
}
# Assess performance (excluding self-regulation)
perf_Euc <- assessperf(simEuc, selfregindex)

# Plot performance against random
pdf(file = "roc_euc_ecoli.pdf")
plot(perf_rand$roc@x.values[[1]], perf_rand$roc@y.values[[1]], type = 'l',
     col = 1, xlab = 'FPR', ylab = 'TPR', lwd = 2)
lines(perf_Euc$roc@x.values[[1]], perf_Euc$roc@y.values[[1]], type = 'l',
      col = 2, lwd = 2)
legend("topleft", c('Random', 'Euclidean'), col = seq(2), lwd = 2)
# (a second grid(col='darkgray') call here was an accidental duplicate)
grid(col = 'black')
dev.off()

pdf(file = "pre_euc_ecoli.pdf")
plot(perf_rand$pre@x.values[[1]], perf_rand$pre@y.values[[1]], type = 'l',
     col = 1, xlab = 'Recall', ylab = 'Precision', xlim = c(0, 0.14),
     ylim = c(0, 1), lwd = 2)
lines(perf_Euc$pre@x.values[[1]], perf_Euc$pre@y.values[[1]], type = 'l',
      col = 2, lwd = 2)
grid(col = 'black')
legend("topright", c('Random', 'Euclidean'), col = seq(2), lwd = 2)
dev.off()

## Pearson-correlation based similarity ##
simPearson <- matrix(0, ntf, ntargets)
for (itf in seq_len(ntf)) {
  cat('.')
  for (itarget in seq_len(ntargets)) {
    simPearson[itf, itarget] <- 1 - cor.dist(exp[c(targets[itarget], tf[itf]), ])
  }
}
# Assess performance (excluding self-regulation)
perf_Pearson <- assessperf(simPearson, selfregindex)

## Spearman-correlation based similarity ##
simSpearman <- matrix(0, ntf, ntargets)
for (itf in seq_len(ntf)) {
  cat('.')
  for (itarget in seq_len(ntargets)) {
    simSpearman[itf, itarget] <- 1 - spearman.dist(exp[c(targets[itarget], tf[itf]), ])
  }
}
# Assess performance (excluding self-regulation)
perf_Spearman <- assessperf(simSpearman, selfregindex)

## Mutual-information based similarity ##
simMI <- matrix(0, ntf, ntargets)
for (itf in seq_len(ntf)) {
  cat('.')
  for (itarget in seq_len(ntargets)) {
    simMI[itf, itarget] <- 1 - MIdist(exp[c(targets[itarget], tf[itf]), ])
  }
}
# Assess performance (excluding self-regulation)
perf_MI <- assessperf(simMI, selfregindex)

# Plot all ROC curves
pdf(file = "roc_cor_ecoli.pdf")
plot(perf_rand$roc@x.values[[1]], perf_rand$roc@y.values[[1]], type = 'l',
     col = 1, xlab = 'FPR', ylab = 'TPR', lwd = 2)
lines(perf_Euc$roc@x.values[[1]], perf_Euc$roc@y.values[[1]], type = 'l',
      col = 2, lwd = 2)
lines(perf_Pearson$roc@x.values[[1]], perf_Pearson$roc@y.values[[1]],
      type = 'l', col = 3, lwd = 2)
lines(perf_Spearman$roc@x.values[[1]], perf_Spearman$roc@y.values[[1]],
      type = 'l', col = 4, lwd = 2)
lines(perf_MI$roc@x.values[[1]], perf_MI$roc@y.values[[1]], type = 'l',
      col = 5, lwd = 2)
# fixed typo: 'Randome' -> 'Random'
legend("topleft", c('Random', 'Euclidean', 'Pearson', 'Spearman', 'MI'),
       col = seq(5), lwd = 2)
grid(col = 'black')
dev.off()

# Plot all Precision-Recall curves
pdf(file = "pre_cor_ecoli.pdf")
plot(perf_rand$pre@x.values[[1]], perf_rand$pre@y.values[[1]], type = 'l',
     col = 1, xlab = 'Recall', ylab = 'Precision', xlim = c(0, 0.14),
     ylim = c(0, 1), lwd = 2)
lines(perf_Euc$pre@x.values[[1]], perf_Euc$pre@y.values[[1]], type = 'l',
      col = 2, lwd = 2)
lines(perf_Pearson$pre@x.values[[1]], perf_Pearson$pre@y.values[[1]],
      type = 'l', col = 3, lwd = 2)
lines(perf_Spearman$pre@x.values[[1]], perf_Spearman$pre@y.values[[1]],
      type = 'l', col = 4, lwd = 2)
lines(perf_MI$pre@x.values[[1]], perf_MI$pre@y.values[[1]], type = 'l',
      col = 5, lwd = 2)
legend("topright", c('Random', 'Euclidean', 'Pearson', 'Spearman', 'MI'),
       col = seq(5), lwd = 2)
grid(col = 'black')
dev.off()

## De novo inference with feature selection methods ##
##--------------------------------------------------##

## With LARS + stability selection
source('stabilityselection.R')
score_lars <- matrix(0, ntf, ntargets)
nbootstrap <- 500
nstepsLARS <- 5
for (itarget in seq_len(ntargets)) {
  cat('.')
  curtarget <- targets[itarget]
  # TF to be used for prediction: all TF except the target itself when the
  # target is a TF (avoids trivial self-regulation)
  predictorTFid <- tf != curtarget
  # Extract the variables for regression: y = target profile, x = TF profiles
  y <- exp[curtarget, ]
  x <- exp[tf[predictorTFid], ]
  # Stability selection scores for each candidate TF
  r <- stabilityselection(t(x), y, nbootstrap = nbootstrap,
                          nsteps = nstepsLARS, plotme = FALSE)
  score_lars[predictorTFid, itarget] <- r
}
perf_lars <- assessperf(score_lars, selfregindex)

## With random forests
library(randomForest)
score_rf <- matrix(0, ntf, ntargets)
for (itarget in seq_len(ntargets)) {
  cat('.')
  curtarget <- targets[itarget]
  # TF to be used for prediction (all TF except the target if it is a TF)
  predictorTFid <- tf != curtarget
  # Extract the variables for regression
  y <- exp[curtarget, ]
  x <- exp[tf[predictorTFid], ]
  # Random forest regression; variable importance scores the candidate TFs
  # (original comment wrongly said "stability selection" here)
  r <- randomForest(t(x), y)
  score_rf[predictorTFid, itarget] <- r$importance
}
perf_rf <- assessperf(score_rf, selfregindex)

# Plot all ROC curves
pdf(file = "roc_ecoli.pdf")
plot(perf_rand$roc@x.values[[1]], perf_rand$roc@y.values[[1]], type = 'l',
     col = 1, xlab = 'FPR', ylab = 'TPR', lwd = 2)
lines(perf_Euc$roc@x.values[[1]], perf_Euc$roc@y.values[[1]], type = 'l',
      col = 2, lwd = 2)
lines(perf_Pearson$roc@x.values[[1]], perf_Pearson$roc@y.values[[1]],
      type = 'l', col = 3, lwd = 2)
lines(perf_Spearman$roc@x.values[[1]], perf_Spearman$roc@y.values[[1]],
      type = 'l', col = 4, lwd = 2)
lines(perf_MI$roc@x.values[[1]], perf_MI$roc@y.values[[1]], type = 'l',
      col = 5, lwd = 2)
lines(perf_lars$roc@x.values[[1]], perf_lars$roc@y.values[[1]], type = 'l',
      col = 6, lwd = 2)
lines(perf_rf$roc@x.values[[1]], perf_rf$roc@y.values[[1]], type = 'l',
      col = 7, lwd = 2)
# fixed typo: 'Randome' -> 'Random'
legend("topleft",
       c('Random', 'Euclidean', 'Pearson', 'Spearman', 'MI', 'Lasso', 'RF'),
       col = seq(7), lwd = 2)
grid(col = 'black')
dev.off()

# Plot all Precision-Recall curves
# (device stays open here: the remaining curves, legend and dev.off() follow)
pdf(file = "pre_ecoli.pdf")
plot(perf_rand$pre@x.values[[1]], perf_rand$pre@y.values[[1]], type = 'l',
     col = 1, xlab = 'Recall', ylab = 'Precision', xlim = c(0, 0.14),
     ylim = c(0, 1), lwd = 2)
lines(perf_Euc$pre@x.values[[1]], perf_Euc$pre@y.values[[1]], type = 'l',
      col = 2, lwd = 2)
lines(perf_Pearson$pre@x.values[[1]], perf_Pearson$pre@y.values[[1]],
      type = 'l', col = 3, lwd = 2)
lines(perf_Spearman$pre@x.values[[1]], perf_Spearman$pre@y.values[[1]],
      type = 'l', col = 4, lwd = 2)
lines(perf_MI$pre@x.values[[1]], perf_MI$pre@y.values[[1]], type = 'l',
      col = 5, lwd = 2)
lines(perf_lars$pre@x.values[[1]], perf_lars$pre@y.values[[1]], type = 'l',
      col = 6, lwd = 2)
lines(perf_rf$pre@x.values[[1]], perf_rf$pre@y.values[[1]], type = 'l',
      col = 7, lwd = 2)
legend("topright",
       c('Random', 'Euclidean', 'Pearson', 'Spearman', 'MI', 'Lasso', 'RF'),
       col = seq(7), lwd = 2)
grid(col = 'black')
dev.off()

# Save results
save(perf_rand, perf_Euc, perf_Pearson, perf_Spearman, perf_MI, perf_lars,
     perf_rf, file = "perf.Rdata")

# Extract the predictions at 50% precision: take the smallest score threshold
# for which the Pearson precision-recall curve still reaches > 0.5 precision
threshold <- perf_Pearson$pre@alpha.values[[1]][
  max(which(perf_Pearson$pre@y.values[[1]] > 0.5))]
# Zero out self-regulations so they are never predicted
# (local variable renamed from misspelled 'simPersonwithoutselfreg')
simPearsonwithoutselfreg <- simPearson
simPearsonwithoutselfreg[selfregindex] <- 0
predpairs <- which(simPearsonwithoutselfreg >= threshold, arr.ind = TRUE)
predreg <- cbind(tf[predpairs[, 1]], targets[predpairs[, 2]])
write.table(predreg, file = "myprededges.txt", row.names = FALSE,
            col.names = FALSE)
# We can now look at them in Cytoscape!

## Supervised inference ##
##----------------------##

## Data preparation ##
# We assume that 80% of the edges are known, the rest are unknown
# (the original comment said "half", but the code samples nreg*0.8)
trainregindex <- sample(nreg, nreg * 0.8)
trainreglist <- reg[trainregindex, ]
# Column-major linear indices of the training edges in the ntf*ntargets matrix
trainregindexinmatrix <- match(trainreglist[, 1], tf) +
  ntf * (match(trainreglist[, 2], targets) - 1)
# Make a ntf*ntargets matrix of binary labels from the training regulations:
# +1 for known regulations, -1 everywhere else
trainreg <- matrix(-1, ntf, ntargets)
trainreg[trainregindexinmatrix] <- 1

## SIRENE scores (using a SVM) ##
## Note that for sake of simplicity we implement below a very simple method:
## train a SVM to discriminate known regulated genes from unknown ones, and use
## the training SVM score on the unknown ones to predict missing regulations.
## Better methods for learning from positive and unlabeled examples are possible.
# library() errors immediately if kernlab is missing (require() only warns)
library(kernlab)

# By default we give a very bad score (e.g., if no training gene is available
# for a tf, we do not even train a SVM)
score_sirene <- matrix(-10, ntf, ntargets)

# Extract the expression matrix for all target genes
x <- exp[targets, ]

# Consider each TF in turn
for (itf in seq_len(ntf)) {
  cat('.')
  # Only train when this TF has at least one positive (known regulated) target
  if (max(trainreg[itf, ]) > 0) {
    # Train a SVM to discriminate regulated vs non-regulated targets
    m <- ksvm(x, trainreg[itf, ], type = 'C-svc', kernel = "rbf", C = 1)
    # The score is the training SVM decision value
    score_sirene[itf, ] <- predict(m, x, type = 'decision')
  }
}

# Assess performance (excluding self-regulation and known regulations)
exclude <- union(trainregindexinmatrix, selfregindex)
perf_SIRENE <- assessperf(score_sirene, exclude)
perf_Pearson <- assessperf(simPearson, exclude)

# Plot ROC curves
plot(perf_Pearson$roc@x.values[[1]], perf_Pearson$roc@y.values[[1]],
     type = 'l', col = 1, xlab = 'FPR', ylab = 'TPR', lwd = 2)
lines(perf_SIRENE$roc@x.values[[1]], perf_SIRENE$roc@y.values[[1]],
      type = 'l', col = 2, lwd = 2)
legend("topright", c('Pearson', 'SIRENE'), col = seq(2), lwd = 2)
grid()

# Plot precision-recall curves (original comment wrongly said "Plot AUC")
plot(perf_Pearson$pre@x.values[[1]], perf_Pearson$pre@y.values[[1]],
     type = 'l', col = 1, xlab = 'Recall', ylab = 'Precision', lwd = 2,
     ylim = c(0, 1))
lines(perf_SIRENE$pre@x.values[[1]], perf_SIRENE$pre@y.values[[1]],
      type = 'l', col = 2, lwd = 2)
legend("topright", c('Pearson', 'SIRENE'), col = seq(2), lwd = 2)
grid()