%last updated: 11/01/2010
%Copyright: Kevin Bleakley 2010.
%Purpose: script to do drug-target interaction prediction. Uses 10 fold cross-validation.
%Requires: the libsvm package. [GIVE URL!]
%The actual datasets we used in the correct format are downloadable
%from the same page you got this file. So you can see them run before trying to use your own data.
%Output: gives two continuously-valued scores for each putative drug-target interaction. Also,
%we calculate ROC curves and Area under them (AUC).
%General remarks:
%1) The code should work on similarity matrices also, and not just positive semi-definite
% kernel matrices.
%2) You have to have pre-calculated your similarity/kernel matrices in advance.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%here we load the dataset(s) and, as they are textfiles converted from a format needing to be read by
%"R", they have one extra column to be removed before proceeding. You have to comment or uncomment the
%data set that you want to use.
% %1) ENZYMES
% %adjacency matrix:
% load e_admat_dgc2.txt
% y = e_admat_dgc2(:,2:(size(e_admat_dgc2,2)));
% %compound similarity matrix:
% load e_simmat_dc2.txt
% kCompound = e_simmat_dc2(:,2:(size(e_simmat_dc2,2)));
% %target similarity matrix:
% load e_simmat_dg2.txt
% kTarget = e_simmat_dg2(:,2:(size(e_simmat_dg2,2)));
% %2) ION CHANNELS
% %adjacency matrix:
% load ic_admat_dgc2.txt
% y = ic_admat_dgc2(:,2:(size(ic_admat_dgc2,2)));
% %compound similarity matrix:
% load ic_simmat_dc2.txt
% kCompound = ic_simmat_dc2(:,2:(size(ic_simmat_dc2,2)));
% %target similarity matrix:
% load ic_simmat_dg2.txt
% kTarget = ic_simmat_dg2(:,2:(size(ic_simmat_dg2,2)));
% %3)GPCRs
% %adjacency matrix:
% load gpcr_admat_dgc2.txt
% y = gpcr_admat_dgc2(:,2:(size(gpcr_admat_dgc2,2)));
% %compound similarity matrix:
% load gpcr_simmat_dc2.txt
% kCompound = gpcr_simmat_dc2(:,2:(size(gpcr_simmat_dc2,2)));
% %target similarity matrix:
% load gpcr_simmat_dg2.txt
% kTarget = gpcr_simmat_dg2(:,2:(size(gpcr_simmat_dg2,2)));
%4)NUCLEAR RECEPTORS
%adjacency matrix:
load nr_admat_dgc2.txt
y = nr_admat_dgc2(:,2:(size(nr_admat_dgc2,2)));
%compound similarity matrix:
load nr_simmat_dc2.txt
kCompound = nr_simmat_dc2(:,2:(size(nr_simmat_dc2,2)));
%target similarity matrix:
load nr_simmat_dg2.txt
kTarget = nr_simmat_dg2(:,2:(size(nr_simmat_dg2,2)));
%We had problems with kCompound having imaginary eigenvalues. Here, we symmetrize kCompound
%before continuing:
kCompound = (kCompound + kCompound')/2;
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Checking for positive semi-definite. This is because, at present, these are "similarity" matrices rather
%than "kernel" matrices. For now, we just add increments of a small epsilon to the diagonal, until the
%matrix becomes positive semi-definite:
epsilon = .1;
while sum(eig(kCompound) >= 0) < size(kCompound,1) | isreal(eig(kCompound))==0
kCompound = kCompound + epsilon*eye(size(kCompound,1));
end
while sum(eig(kTarget) >= 0) < size(kTarget,1) | isreal(eig(kTarget))==0
kTarget = kTarget + epsilon*eye(size(kTarget,1));
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%intialise some variables:
lengthKCompound = size(kCompound,1);
myPredictions = zeros(lengthKCompound,size(y,1));
lengthKTarget = size(kTarget,1);
myPredictions2 = zeros(lengthKTarget,size(y,2));
numbEdges1 = zeros(lengthKCompound,size(y,1));
numbEdges2 = zeros(lengthKTarget,size(y,2));
%randomly permute the order of the drugs and compounds.:
firstRP = randperm(size(kCompound,1));
secondRP = randperm(size(kTarget,1));
kCompound = kCompound(firstRP,firstRP);
kTarget = kTarget(secondRP,secondRP);
y = y(secondRP,firstRP);
%and we have to save the 'inverse' of these permutations:
[myback1 mb1] = sort(firstRP);
[myback2 mb2] = sort(secondRP);
%define the number of cross-validation splits:
numOfSplits = 10;
theseIndices = zeros(numOfSplits,2);
thebasic = floor(size(y,1)/numOfSplits);
theseIndices(1,1) = 1;
for gg = 1:(numOfSplits-1)
theseIndices(gg,2) = thebasic*gg;
theseIndices(gg+1,1) = thebasic*gg + 1;
end
theseIndices(numOfSplits,2) = size(y,1);
theseIndices2 = zeros(numOfSplits,2);
thebasic2 = floor(size(y,2)/numOfSplits);
theseIndices2(1,1) = 1;
for gg = 1:(numOfSplits-1)
theseIndices2(gg,2) = thebasic2*gg;
theseIndices2(gg+1,1) = thebasic2*gg + 1;
end
theseIndices2(numOfSplits,2) = size(y,2);
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Fixing each target in turn, then doing ten fold cross validation
%on the set of compounds, to predict which ones
%are/are not targeting the target in question:
for i=1:size(y,1);
i
pause(.3)
currentY = y(i,:)';
for j = 1:lengthKCompound
%%%!%%%
whatSplit = sum(j >= theseIndices2(:,1));
%%%!%%%
%j;
if sum(currentY(setdiff(1:lengthKCompound,theseIndices2(whatSplit,1):theseIndices2(whatSplit,2))) == 1) > 0
%these are the weights for positive and negative examples. They alter the behavior of libsvm.
m2a = 1;
m2b = 1;
%defining the train and test matrices that will be input into libsvm.
%%%!%%%
K1 = [[1:(lengthKCompound-(theseIndices2(whatSplit,2)-theseIndices2(whatSplit,1)+1))]' kCompound(setdiff(1:lengthKCompound,[theseIndices2(whatSplit,1):theseIndices2(whatSplit,2)]),setdiff(1:lengthKCompound,[theseIndices2(whatSplit,1):theseIndices2(whatSplit,2)]))];
K2 = [(j)' kCompound(j,setdiff(1:lengthKCompound,[theseIndices2(whatSplit,1):theseIndices2(whatSplit,2)]))];
%%%!%%%
%training:
model = svmtrain(currentY(setdiff(1:lengthKCompound,theseIndices2(whatSplit,1):theseIndices2(whatSplit,2))),K1,strcat(['-t 4 -c 1 -w1 ',num2str(m2a),' -w-1 ',num2str(m2b)]));
%testing:
[predict_label,accuracy,dec_values] = svmpredict(currentY(j),K2, model);
%%!%%
%this line is to fix a weakness in libsvm, the first training label is always automatically
%labeled +1, even if it's -1 !
firstLabel = currentY(1)*(j>theseIndices2(1,2)) + currentY(theseIndices2(2,1))*(j<=theseIndices2(1,2));
%%!%%
myPredictions(j,i) = dec_values*sign(firstLabel - 1/2);
numbEdges1(j,i) = sum(currentY(setdiff(1:lengthKCompound,theseIndices2(whatSplit,1):theseIndices2(whatSplit,2))) == 1);
else
myPredictions(j,i) = -1;
numbEdges1(j,i) = 0;
end
end
end
%un-permuting the row and column orders:
myPredictions = myPredictions(mb1,mb2);
ytemp = y';
ytemp = ytemp(mb1,mb2);
ytemp = ytemp';
%Turning myPredictions and "y" into vectors:
myScores =[];
myTrueLabels = [];
for k = 1:lengthKCompound
myScores = [myScores myPredictions(k,:)];
myTrueLabels = [myTrueLabels ytemp(:,k)'];
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Fixing each compound in turn, then doing ten fold cross validation
%on the set of targets, to predict which ones
%are/are not targeted by the compound in question:
for i= 1:size(y,2);
i;
currentY = y(:,i);
for j = 1:lengthKTarget
%%%!%%%
whatSplit = sum(j >= theseIndices(:,1));
%%%!%%%
if sum(currentY(setdiff(1:lengthKTarget,theseIndices(whatSplit,1):theseIndices(whatSplit,2))) == 1) > 0
%these are the weights for positive and negative examples. They alter the behavior of
%libsvm.
m2a = 1;
m2b = 1;
%matrices to input to libsvm:
K1 = [[1:(lengthKTarget-(theseIndices(whatSplit,2)-theseIndices(whatSplit,1)+1))]' kTarget(setdiff(1:lengthKTarget,[theseIndices(whatSplit,1):theseIndices(whatSplit,2)]),setdiff(1:lengthKTarget,[theseIndices(whatSplit,1):theseIndices(whatSplit,2)]))];
K2 = [(j)' kTarget(j,setdiff(1:lengthKTarget,[theseIndices(whatSplit,1):theseIndices(whatSplit,2)]))];
%training:
model = svmtrain(currentY(setdiff(1:lengthKTarget,theseIndices(whatSplit,1):theseIndices(whatSplit,2))),K1,strcat(['-t 4 -c 1 -w1 ',num2str(m2a),' -w-1 ',num2str(m2b)]));
%testing:
[predict_label,accuracy,dec_values] = svmpredict(currentY(j),K2, model);
%fixing libsvms desire to label -1 examples as +1 examples if they are the first item in the
%training set.
firstLabel = currentY(1)*(j>theseIndices(1,2)) + currentY(theseIndices(2,1))*(j<=theseIndices(1,2));
myPredictions2(j,i) = dec_values*sign(firstLabel - 1/2);
numbEdges2(j,i) = sum(currentY(setdiff(1:lengthKTarget,theseIndices2(whatSplit,1):theseIndices2(whatSplit,2))) == 1);
else
myPredictions2(j,i) = -1;
numbEdges2(j,i) = 0;
end
end
end
%un-permuting the row and column orders:
myPredictions2 = myPredictions2';
myPredictions2 = myPredictions2(mb1,mb2);
myPredictions2 = myPredictions2';
y = y(mb2,mb1);
%Turning myPredictions and "y" into vectors:
myScores2 =[];
myTrueLabels2 = [];
for k = 1:lengthKTarget
myScores2 = [myScores2 myPredictions2(k,:)];
myTrueLabels2 = [myTrueLabels2 y(k,:)];
end
%and again, but this time by column, (for later):
myScores2b =[];
myTrueLabels2b = [];
for k = 1:lengthKCompound
myScores2b = [myScores2b myPredictions2(:,k)'];
myTrueLabels2b = [myTrueLabels2b y(:,k)'];
end
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%AREA UNDER ROC CURVE for fixing targets in turn:
myNumb = length((-2.5):.001:(2.5));
TruPos = zeros(1,myNumb);
FalPos = zeros(1,myNumb);
countMan = 0;
for moveit = (-2.5):.001:(2.5)
countMan = countMan + 1;
TruPos(countMan) = sum(sign(myScores + moveit)==1 & myTrueLabels==1);
FalPos(countMan) = sum(sign(myScores + moveit)==1 & myTrueLabels==0);
end
plot(FalPos/max(FalPos),TruPos/max(TruPos),'r');
hold;
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% %calculating the area under the ROC curve:
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
xx = FalPos/max(FalPos);
yy = TruPos/max(TruPos);
areaUnderROC = 0;
old = [0 0];
for bb = 1:myNumb
new = [xx(bb) yy(bb)];
if new(1) == old(1) & new(2) > old(2)
old = new;
elseif new(1) > old(1) & new(2) > old(2)
areaUnderROC = areaUnderROC + ((old(2) + new(2))/2) * (new(1) - old(1));
old = new;
elseif new(1) > old(1) & new(2) == old(2)
areaUnderROC = areaUnderROC + old(2) * (new(1) - old(1));
old = new;
end
end
areaUnderROC
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% %AREA UNDER ROC CURVE for fixing compounds in turn:
myNumb = length((-2.5):.001:(2.5));
TruPos2 = zeros(1,myNumb);
FalPos2 = zeros(1,myNumb);
countMan = 0;
for moveit = (-2.5):.001:(2.5)
countMan = countMan + 1;
TruPos2(countMan) = sum(sign(myScores2 + moveit)==1 & myTrueLabels2==1);
FalPos2(countMan) = sum(sign(myScores2 + moveit)==1 & myTrueLabels2==0);
end
plot(FalPos2/max(FalPos2),TruPos2/max(TruPos2),'b');
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% %calculating the area under the ROC curve:
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
xx = FalPos2/max(FalPos2);
yy = TruPos2/max(TruPos2);
areaUnderROC = 0;
old = [0 0];
for bb = 1:myNumb
new = [xx(bb) yy(bb)];
if new(1) == old(1) & new(2) > old(2)
old = new;
elseif new(1) > old(1) & new(2) > old(2)
areaUnderROC = areaUnderROC + ((old(2) + new(2))/2) * (new(1) - old(1));
old = new;
elseif new(1) > old(1) & new(2) == old(2)
areaUnderROC = areaUnderROC + old(2) * (new(1) - old(1));
old = new;
end
end
areaUnderROC
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% %AREA UNDER ROC CURVE for both situations added together:
%various ways of combining the two predictions made for each edge:
%1) adding them:
%myPredictions3 = myPredictions + myPredictions2';
%2) taking the largest:
myPredictions3 = max(myPredictions,myPredictions2');
myScores3 =[];
myTrueLabels3 = [];
for k = 1:lengthKCompound
myScores3 = [myScores3 myPredictions3(k,:)];
myTrueLabels3 = [myTrueLabels3 y(:,k)'];
end
myNumb = length((-5):.001:(5));
TruPos3 = zeros(1,myNumb);
FalPos3 = zeros(1,myNumb);
countMan = 0;
for moveit = (-5):.001:(5)
countMan = countMan + 1;
TruPos3(countMan) = sum(sign(myScores3 + moveit)==1 & myTrueLabels3==1);
FalPos3(countMan) = sum(sign(myScores3 + moveit)==1 & myTrueLabels3==0);
end
plot(FalPos3/max(FalPos3),TruPos3/max(TruPos3),'g');
hold off;
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% %calculating the area under the ROC curve:
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
xx3 = FalPos3/max(FalPos3);
yy3 = TruPos3/max(TruPos3);
areaUnderROC3 = 0;
old3 = [0 0];
for bb = 1:length(xx3)
new3 = [xx3(bb) yy3(bb)];
if new3(1) == old3(1) & new3(2) > old3(2)
old3 = new3;
elseif new3(1) > old3(1) & new3(2) > old3(2)
areaUnderROC3 = areaUnderROC3 + ((old3(2) + new3(2))/2) * (new3(1) - old3(1));
old3 = new3;
elseif new3(1) > old3(1) & new3(2) == old3(2)
areaUnderROC3 = areaUnderROC3 + old3(2) * (new3(1) - old3(1));
old3 = new3;
end
end
areaUnderROC3
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%here we calculate area under precision-recall curve, only for the 3rd "max" data set.
%by copying and pasting and changing all the '3' to '1' or '2',
%you can do this for the TruePos1 etc., and TruePos2 etc.
%FDR curves and/or precision-recall curves:
xFDRPR = TruPos3/max(TruPos3);
yFDR = FalPos3./(FalPos3 + TruPos3);
yPR = 1 - yFDR;
plot(xFDRPR,yFDR);
plot(xFDRPR,yPR);
%area under precision accuracy curve: (actually it's a code for area under the FDR curve, so
%in the last line we do "1 - result".
xxPR = TruPos3/max(TruPos3);
yyPR = FalPos3./(FalPos3 + TruPos3 + .000000001*(FalPos3==0 & TruPos3==0));
xRun = 0;
addToSum = 0;
myXa = 0;
for myCounta = 1:myNumb
if xxPR(myCounta) > xRun & myCounta > 1
myXb = xxPR(myCounta);
addToSum = addToSum + (myXb - myXa)*(yyPR(myCounta - 1) + yyPR(myCounta))/2;
myXa = myXb;
xRun = xxPR(myCounta);
elseif xxPR(myCounta) > xRun & myCounta == 1
myXa = xxPR(1);
xRun = xxPR(myCounta);
end
end
AUPR = 1 - addToSum
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%