@comment{{This file has been generated by bib2bib 1.97}}
@comment{{Command line: bib2bib ../bibli.bib -c 'subject:"biosvm" or keywords:"biosvm"' -ob tmp.bib}}
@article{Aires-de-Sousa2005Prediction,
  author   = {Aires-de-Sousa, J. and Gasteiger, J.},
  title    = {Prediction of enantiomeric excess in a combinatorial library of catalytic enantioselective reactions.},
  journal  = {J {C}omb {C}hem},
  year     = {2005},
  volume   = {7},
  number   = {2},
  pages    = {298--301},
  abstract = {A quantitative structure-enantioselectivity relationship was established for a combinatorial library of enantioselective reactions performed by addition of diethyl zinc to benzaldehyde. {C}hiral catalysts and additives were encoded by their chirality codes and presented as input to neural networks. {T}he networks were trained to predict the enantiomeric excess. {W}ith independent test sets, predictions of enantiomeric excess could be made with an average error as low as 6\% ee. {M}ultilinear regression, perceptrons, and support vector machines were also evaluated as modeling tools. {T}he method is of interest for the computer-aided design of combinatorial libraries involving chiral compounds or enantioselective reactions. {T}his is the first example of a quantitative structure-property relationship based on chirality codes.},
  doi      = {10.1021/cc049961q},
  pdf      = {../local/Aires-de-Sousa2005Prediction.pdf},
  file     = {Aires-de-Sousa2005Prediction.pdf:local/Aires-de-Sousa2005Prediction.pdf:PDF},
  keywords = {biosvm chemoinformatics},
  url      = {http://dx.doi.org/10.1021/cc049961q}
}
@inproceedings{Aliferis2002Machine,
  author    = {Aliferis, C. F. and Hardin, D. P. and Massion, P.},
  title     = {Machine {L}earning {M}odels {F}or {L}ung {C}ancer {C}lassification {U}sing {A}rray {C}omparative {G}enomic {H}ybridization},
  booktitle = {Proceedings of the 2002 {A}merican {M}edical {I}nformatics {A}ssociation ({AMIA}) {A}nnual {S}ymposium},
  year      = {2002},
  pages     = {7--11},
  abstract  = {Array {CGH} is a recently introduced technology that measures changes in the gene copy number of hundreds of genes in a single experiment. {T}he primary goal of this study was to develop machine learning models that classify non-small {L}ung {C}ancers according to histopathology types and to compare several machine learning methods in this learning task. {DNA} from tumors of 37 patients (21 squamous carcinomas, and 16 adenocarcinomas) were extracted and hybridized onto a 452 {BAC} clone array. {T}he following algorithms were used: {KNN}, {D}ecision {T}ree {I}nduction, {S}upport {V}ector {M}achines and {F}eed-{F}orward {N}eural {N}etworks. {P}erformance was measured via leave-one-out classification accuracy. {T}he best multi-gene model found had a leave-one-out accuracy of 89.2\%. {D}ecision {T}rees performed poorer than the other methods in this learning task and dataset. {W}e conclude that gene copy numbers as measured by array {CGH} are, collectively, an excellent indicator of histological subtype. {S}everal interesting research directions are discussed.},
  pdf       = {../local/Aliferis2002Machine.pdf},
  file      = {Aliferis2002Machine.pdf:local/Aliferis2002Machine.pdf:PDF},
  keywords  = {biosvm microarray, cgh},
  owner     = {jeanphilippevert}
}
@article{Ambroise2002Selection,
  author   = {Ambroise, C. and McLachlan, G. J.},
  title    = {Selection bias in gene extraction on the basis of microarray gene-expression data},
  journal  = {Proc. {N}atl. {A}cad. {S}ci. {USA}},
  year     = {2002},
  volume   = {99},
  number   = {10},
  pages    = {6562--6566},
  abstract = {In the context of cancer diagnosis and treatment, we consider the problem of constructing an accurate prediction rule on the basis of a relatively small number of tumor tissue samples of known type containing the expression data on very many (possibly thousands) genes. {R}ecently, results have been presented in the literature suggesting that it is possible to construct a prediction rule from only a few genes such that it has a negligible prediction error rate. {H}owever, in these results the test error or the leave-one-out cross-validated error is calculated without allowance for the selection bias. {T}here is no allowance because the rule is either tested on tissue samples that were used in the first instance to select the genes being used in the rule or because the cross-validation of the rule is not external to the selection process; that is, gene selection is not performed in training the rule at each stage of the cross-validation process. {W}e describe how in practice the selection bias can be assessed and corrected for by either performing a cross-validation or applying the bootstrap external to the selection process. {W}e recommend using 10-fold rather than leave-one-out cross-validation, and concerning the bootstrap, we suggest using the so-called .632+ bootstrap error estimate designed to handle overfitted prediction rules. {U}sing two published data sets, we demonstrate that when correction is made for the selection bias, the cross-validated error is no longer zero for a subset of only a few genes.},
  pdf      = {../local/Ambroise2002Selection.pdf},
  file     = {Ambroise2002Selection.pdf:local/Ambroise2002Selection.pdf:PDF},
  keywords = {featureselection biosvm},
  owner    = {jeanphilippevert},
  url      = {http://www.pnas.org/cgi/content/abstract/99/10/6562}
}
@article{Anderson2003new,
  author   = {Anderson, D. C. and Li, W. and Payan, D. G. and Noble, W. S.},
  title    = {A new algorithm for the evaluation of shotgun peptide sequencing in proteomics: support vector machine classification of peptide {{MS}/{MS}} spectra and {SEQUEST} scores.},
  journal  = {J {P}roteome {R}es},
  year     = {2003},
  volume   = {2},
  number   = {2},
  pages    = {137--146},
  abstract = {Shotgun tandem mass spectrometry-based peptide sequencing using programs such as {SEQUEST} allows high-throughput identification of peptides, which in turn allows the identification of corresponding proteins. {W}e have applied a machine learning algorithm, called the support vector machine, to discriminate between correctly and incorrectly identified peptides using {SEQUEST} output. {E}ach peptide was characterized by {SEQUEST}-calculated features such as delta {C}n and {X}corr, measurements such as precursor ion current and mass, and additional calculated parameters such as the fraction of matched {MS}/{MS} peaks. {T}he trained {SVM} classifier performed significantly better than previous cutoff-based methods at separating positive from negative peptides. {P}ositive and negative peptides were more readily distinguished in training set data acquired on a {QTOF}, compared to an ion trap mass spectrometer. {T}he use of 13 features, including four new parameters, significantly improved the separation between positive and negative peptides. {U}se of the support vector machine and these additional parameters resulted in a more accurate interpretation of peptide {MS}/{MS} spectra and is an important step toward automated interpretation of peptide tandem mass spectrometry data in proteomics.},
  pdf      = {../local/Anderson2003new.pdf},
  file     = {Anderson2003new.pdf:local/Anderson2003new.pdf:PDF},
  keywords = {biosvm proteomics},
  owner    = {jeanphilippevert}
}
@article{Aphinyanaphongs2005Text,
  author   = {Yindalon Aphinyanaphongs and Ioannis Tsamardinos and Alexander Statnikov and Douglas Hardin and Constantin F Aliferis},
  title    = {Text categorization models for high-quality article retrieval in internal medicine.},
  journal  = {J. {A}m. {M}ed. {I}nform. {A}ssoc.},
  year     = {2005},
  volume   = {12},
  number   = {2},
  pages    = {207--216},
  abstract = {O{BJECTIVE} {F}inding the best scientific evidence that applies to a patient problem is becoming exceedingly difficult due to the exponential growth of medical publications. {T}he objective of this study was to apply machine learning techniques to automatically identify high-quality, content-specific articles for one time period in internal medicine and compare their performance with previous {B}oolean-based {P}ub{M}ed clinical query filters of {H}aynes et al. {DESIGN} {T}he selection criteria of the {ACP} {J}ournal {C}lub for articles in internal medicine were the basis for identifying high-quality articles in the areas of etiology, prognosis, diagnosis, and treatment. {N}aive {B}ayes, a specialized {A}da{B}oost algorithm, and linear and polynomial support vector machines were applied to identify these articles. {MEASUREMENTS} {T}he machine learning models were compared in each category with each other and with the clinical query filters using area under the receiver operating characteristic curves, 11-point average recall precision, and a sensitivity/specificity match method. {RESULTS} {I}n most categories, the data-induced models have better or comparable sensitivity, specificity, and precision than the clinical query filters. {T}he polynomial support vector machine models perform the best among all learning methods in ranking the articles as evaluated by area under the receiver operating curve and 11-point average recall precision. {CONCLUSION} {T}his research shows that, using machine learning methods, it is possible to automatically build models for retrieving high-quality, content-specific articles using inclusion or citation by the {ACP} {J}ournal {C}lub as a gold standard in a given time period in internal medicine that perform better than the 1994 {P}ub{M}ed clinical query filters.},
  doi      = {10.1197/jamia.M1641},
  pdf      = {../local/Aphinyanaphongs2005Text.pdf},
  file     = {Aphinyanaphongs2005Text.pdf:local/Aphinyanaphongs2005Text.pdf:PDF},
  keywords = {biosvm nlp},
  pii      = {M1641},
  url      = {http://dx.doi.org/10.1197/jamia.M1641}
}
@article{Arimoto2005Development,
  author   = {Rieko Arimoto and Madhu-Ashni Prasad and Eric M Gifford},
  title    = {Development of {CYP}3{A}4 inhibition models: comparisons of machine-learning techniques and molecular descriptors.},
  journal  = {J {B}iomol {S}creen},
  year     = {2005},
  volume   = {10},
  number   = {3},
  pages    = {197--205},
  month    = apr,
  abstract = {Computational models of cytochrome {P}450 3{A}4 inhibition were developed based on high-throughput screening data for 4470 proprietary compounds. {M}ultiple models differentiating inhibitors ({IC}(50) <3 micro{M}) and noninhibitors were generated using various machine-learning algorithms (recursive partitioning [{RP}], {B}ayesian classifier, logistic regression, k-nearest-neighbor, and support vector machine [{SVM}]) with structural fingerprints and topological indices. {N}ineteen models were evaluated by internal 10-fold cross-validation and also by an independent test set. {T}hree most predictive models, {B}arnard {C}hemical {I}nformation ({BCI})-fingerprint/{SVM}, {MDL}-keyset/{SVM}, and topological indices/{RP}, correctly classified 249, 248, and 236 compounds of 291 noninhibitors and 135, 137, and 147 compounds of 179 inhibitors in the validation set. {T}heir overall accuracies were 82\%, 82\%, and 81\%, respectively. {I}nvestigating applicability of the {BCI}/{SVM} model found a strong correlation between the predictive performance and the structural similarity to the training set. {U}sing {T}animoto similarity index as a confidence measurement for the predictions, the limitation of the extrapolation was 0.7 in the case of the {BCI}/{SVM} model. {T}aking consensus of the 3 best models yielded a further improvement in predictive capability, kappa = 0.65 and accuracy = 83\%. {T}he consensus model could also be tuned to minimize either false positives or false negatives depending on the emphasis of the screening.},
  doi      = {10.1177/1087057104274091},
  keywords = {biosvm chemoinformatics},
  pii      = {10/3/197},
  url      = {http://dx.doi.org/10.1177/1087057104274091}
}
@article{Arodz2005Pattern,
  author   = {Tomasz Arod{\'z} and Marcin Kurdziel and Erik O D Sevre and David A Yuen},
  title    = {Pattern recognition techniques for automatic detection of suspicious-looking anomalies in mammograms.},
  journal  = {Comput. {M}ethods {P}rograms {B}iomed.},
  year     = {2005},
  volume   = {79},
  number   = {2},
  pages    = {135--149},
  month    = aug,
  abstract = {We have employed two pattern recognition methods used commonly for face recognition in order to analyse digital mammograms. {T}he methods are based on novel classification schemes, the {A}da{B}oost and the support vector machines ({SVM}). {A} number of tests have been carried out to evaluate the accuracy of these two algorithms under different circumstances. {R}esults for the {A}da{B}oost classifier method are promising, especially for classifying mass-type lesions. {I}n the best case the algorithm achieved accuracy of 76\% for all lesion types and 90\% for masses only. {T}he {SVM} based algorithm did not perform as well. {I}n order to achieve a higher accuracy for this method, we should choose image features that are better suited for analysing digital mammograms than the currently used ones.},
  doi      = {10.1016/j.cmpb.2005.03.009},
  pdf      = {../local/Arodz2005Pattern.pdf},
  file     = {Arodz2005Pattern.pdf:local/Arodz2005Pattern.pdf:PDF},
  keywords = {biosvm image},
  pii      = {S0169-2607(05)00083-0},
  url      = {http://dx.doi.org/10.1016/j.cmpb.2005.03.009}
}
@article{Atalay2005Implicit,
  author   = {Atalay, V. and Cetin-Atalay, R.},
  title    = {Implicit motif distribution based hybrid computational kernel for sequence classification},
  journal  = {Bioinformatics},
  year     = {2005},
  volume   = {21},
  number   = {8},
  pages    = {1429--1436},
  month    = apr,
  abstract = {M{OTIVATION}: {W}e designed a general computational kernel for classification problems that require specific motif extraction and search from sequences. {I}nstead of searching for explicit motifs, our approach finds the distribution of implicit motifs and uses as a feature for classification. {I}mplicit motif distribution approach may be used as modus operandi for bioinformatics problems that require specific motif extraction and search, which is otherwise computationally prohibitive. {RESULTS}: {A} system named {P}2{SL} that infer protein subcellular targeting was developed through this computational kernel. {T}argeting-signal was modeled by the distribution of subsequence occurrences (implicit motifs) using self-organizing maps. {T}he boundaries among the classes were then determined with a set of support vector machines. {P}2{SL} hybrid computational system achieved approximately 81\% of prediction accuracy rate over {ER} targeted, cytosolic, mitochondrial and nuclear protein localization classes. {P}2{SL} additionally offers the distribution potential of proteins among localization classes, which is particularly important for proteins, shuttle between nucleus and cytosol. {AVAILABILITY}: http://staff.vbi.vt.edu/volkan/p2sl and http://www.i-cancer.fen.bilkent.edu.tr/p2sl {CONTACT}: rengul@bilkent.edu.tr.},
  doi      = {10.1093/bioinformatics/bti212},
  pdf      = {../local/Atalay2005Implicit.pdf},
  file     = {Atalay2005Implicit.pdf:local/Atalay2005Implicit.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://dx.doi.org/10.1093/bioinformatics/bti212}
}
@article{Bao2005Identifying,
  author   = {Lei Bao},
  title    = {Identifying genes related to chemosensitivity using support vector machine.},
  journal  = {Methods {M}ol {M}ed},
  year     = {2005},
  volume   = {111},
  pages    = {233--240},
  abstract = {In an effort to identify genes involved in chemosensitivity and to evaluate the functional relationships between genes and anticancer drugs acting by the same mechanism, a supervised machine learning approach called support vector machine ({SVM}) is used to associate genes with any of five predefined anticancer drug mechanistic categories. {T}he drug activity profiles are used as training examples to train the {SVM} and then the gene expression profiles are used as test examples to predict their associated mechanistic categories. {T}his method of correlating drugs and genes provides a strategy for finding novel biologically significant relationships for molecular pharmacology.},
  keywords = {biosvm},
  pii      = {1-59259-889-7:233}
}
@article{Bao2005Prediction,
  author   = {Lei Bao and Yan Cui},
  title    = {Prediction of the phenotypic effects of non-synonymous single nucleotide polymorphisms using structural and evolutionary information.},
  journal  = {Bioinformatics},
  year     = {2005},
  volume   = {21},
  number   = {10},
  pages    = {2185--2190},
  month    = may,
  abstract = {M{OTIVATION}: {T}here has been great expectation that the knowledge of an individual's genotype will provide a basis for assessing susceptibility to diseases and designing individualized therapy. {N}on-synonymous single nucleotide polymorphisms (ns{SNP}s) that lead to an amino acid change in the protein product are of particular interest because they account for nearly half of the known genetic variations related to human inherited diseases. {T}o facilitate the identification of disease-associated ns{SNP}s from a large number of neutral ns{SNP}s, it is important to develop computational tools to predict the phenotypic effects of ns{SNP}s. {RESULTS}: {W}e prepared a training set based on the variant phenotypic annotation of the {S}wiss-{P}rot database and focused our analysis on ns{SNP}s having homologous 3{D} structures. {S}tructural environment parameters derived from the 3{D} homologous structure as well as evolutionary information derived from the multiple sequence alignment were used as predictors. {T}wo machine learning methods, support vector machine and random forest, were trained and evaluated. {W}e compared the performance of our method with that of the {SIFT} algorithm, which is one of the best predictive methods to date. {A}n unbiased evaluation study shows that for ns{SNP}s with sufficient evolutionary information (with not <10 homologous sequences), the performance of our method is comparable with the {SIFT} algorithm, while for ns{SNP}s with insufficient evolutionary information (<10 homologous sequences), our method outperforms the {SIFT} algorithm significantly. {T}hese findings indicate that incorporating structural information is critical to achieving good prediction accuracy when sufficient evolutionary information is not available. {AVAILABILITY}: {T}he codes and curated dataset are available at http://compbio.utmem.edu/snp/dataset/},
  doi      = {10.1093/bioinformatics/bti365},
  pdf      = {../local/Bao2005Prediction.pdf},
  file     = {Bao2005Prediction.pdf:local/Bao2005Prediction.pdf:PDF},
  keywords = {biosvm},
  pii      = {bti365},
  url      = {http://dx.doi.org/10.1093/bioinformatics/bti365}
}
@article{Bao2002Identifying,
  author   = {Bao, L. and Sun, Z.},
  title    = {Identifying genes related to drug anticancer mechanisms using support vector machine},
  journal  = {F{EBS} {L}ett.},
  year     = {2002},
  volume   = {521},
  pages    = {109--114},
  abstract = {In an effort to identify genes related to the cell line chemosensitivity and to evaluate the functional relationships between genes and anticancer drugs acting by the same mechanism, a supervised machine learning approach called support vector machine was used to label genes into any of the five predefined anticancer drug mechanistic categories. {A}mong dozens of unequivocally categorized genes, many were known to be causally related to the drug mechanisms. {F}or example, a few genes were found to be involved in the biological process triggered by the drugs (e.g. {DNA} polymerase epsilon was the direct target for the drugs from {DNA} antimetabolites category). {DNA} repair-related genes were found to be enriched for about eight-fold in the resulting gene set relative to the entire gene set. {S}ome uncharacterized transcripts might be of interest in future studies. {T}his method of correlating the drugs and genes provides a strategy for finding novel biologically significant relationships for molecular pharmacology.},
  pdf      = {../local/bao02.pdf},
  file     = {bao02.pdf:local/bao02.pdf:PDF},
  keywords = {biosvm microarray},
  subject  = {biokernel},
  url      = {http://www.elsevier.com/febs/402/19/42/article.html}
}
@article{Baumgartner2004Supervised,
  author   = {Baumgartner, C. and Bohm, C. and Baumgartner, D. and Marini, G. and Weinberger, K. and Olgemoller, B. and Liebl, B. and Roscher, A. A.},
  title    = {Supervised machine learning techniques for the classification of metabolic disorders in newborns},
  journal  = {Bioinformatics},
  year     = {2004},
  volume   = {20},
  number   = {17},
  pages    = {2985--2996},
  abstract = {Motivation: {D}uring the {B}avarian newborn screening programme all newborns have been tested for about 20 inherited metabolic disorders. {O}wing to the amount and complexity of the generated experimental data, machine learning techniques provide a promising approach to investigate novel patterns in high-dimensional metabolic data which form the source for constructing classification rules with high discriminatory power. {R}esults: {S}ix machine learning techniques have been investigated for their classification accuracy focusing on two metabolic disorders, phenylketonuria ({PKU}) and medium-chain acyl-{C}o{A} dehydrogenase deficiency ({MCADD}). {L}ogistic regression analysis led to superior classification rules (sensitivity >96.8\%, specificity >99.98\%) compared to all investigated algorithms. {I}ncluding novel constellations of metabolites into the models, the positive predictive value could be strongly increased ({PKU} 71.9\% versus 16.2\%, {MCADD} 88.4\% versus 54.6\% compared to the established diagnostic markers). {O}ur results clearly prove that the mined data confirm the known and indicate some novel metabolic patterns which may contribute to a better understanding of newborn metabolism. {A}vailability: {WEKA} machine learning package: www.cs.waikato.ac.nz/~ml/weka and statistical software package {ADE}-4: http://pbil.univ-lyon1.fr/{ADE}-4},
  doi      = {10.1093/bioinformatics/bth343},
  pdf      = {../local/Baumgartner2004Supervised.pdf},
  file     = {Baumgartner2004Supervised.pdf:local/Baumgartner2004Supervised.pdf:PDF},
  keywords = {biosvm proteomics},
  owner    = {jeanphilippevert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/20/17/2985}
}
@article{Bazzani2001SVM,
  author   = {A. Bazzani and A. Bevilacqua and D. Bollini and R. Brancaccio and R. Campanini and N. Lanconelli and A. Riccardi and D. Romani},
  title    = {An {SVM} classifier to separate false signals from microcalcifications in digital mammograms.},
  journal  = {Phys {M}ed {B}iol},
  year     = {2001},
  volume   = {46},
  number   = {6},
  pages    = {1651--1663},
  month    = jun,
  abstract = {In this paper we investigate the feasibility of using an {SVM} (support vector machine) classifier in our automatic system for the detection of clustered microcalcifications in digital mammograms. {SVM} is a technique for pattern recognition which relies on the statistical learning theory. {I}t minimizes a function of two terms: the number of misclassified vectors of the training set and a term regarding the generalization classifier capability. {W}e compare the {SVM} classifier with an {MLP} (multi-layer perceptron) in the false-positive reduction phase of our detection scheme: a detected signal is considered either microcalcification or false signal, according to the value of a set of its features. {T}he {SVM} classifier gets slightly better results than the {MLP} one ({A}z value of 0.963 against 0.958) in the presence of a high number of training data; the improvement becomes much more evident ({A}z value of 0.952 against 0.918) in training sets of reduced size. {F}inally, the setting of the {SVM} classifier is much easier than the {MLP} one.},
  doi      = {10.1088/0031-9155/46/6/305},
  pdf      = {../local/Bazzani2001SVM.pdf},
  file     = {Bazzani2001SVM.pdf:local/Bazzani2001SVM.pdf:PDF},
  keywords = {biosvm image},
  url      = {http://dx.doi.org/10.1088/0031-9155/46/6/305}
}
@article{Beerenwinkel2003Methods,
  author   = {Beerenwinkel, N. and Lengauer, T. and Daumer, M. and Kaiser, R. and Walter, H. and Korn, K. and Hoffmann, D. and Selbig, J.},
  title    = {Methods for optimizing antiviral combination therapies},
  journal  = {Bioinformatics},
  year     = {2003},
  volume   = {19},
  number   = {Suppl. 1},
  pages    = {i16--i25},
  abstract = {Motivation: {D}espite some progress with antiretroviral combination therapies, therapeutic success in the management of {HIV}-infected patients is limited. {T}he evolution of drug-resistant genetic variants in response to therapy plays a key role in treatment failure and finding a new potent drug combination after therapy failure is considered challenging. {R}esults: {T}o estimate the activity of a drug combination against a particular viral strain, we develop a scoring function whose independent variables describe a set of antiviral agents and viral {DNA} sequences coding for the molecular targets of the respective drugs. {T}he construction of this activity score involves (1) predicting phenotypic drug resistance from genotypes for each drug individually, (2) probabilistic modeling of predicted resistance values and integration into a score for drug combinations, and (3) searching through the mutational neighborhood of the considered strain in order to estimate activity on nearby mutants. {F}or a clinical data set, we determine the optimal search depth and show that the scoring scheme is predictive of therapeutic outcome. {P}roperties of the activity score and applications are discussed. {C}ontact: beerenwinkel@mpi-sb.mpg.de {K}eywords: {HIV}, antiretroviral therapy, drug resistance, {SVM} regression, therapy optimization, sequence space search.},
  pdf      = {../local/Beerenwinkel2003Methods.pdf},
  file     = {Beerenwinkel2003Methods.pdf:local/Beerenwinkel2003Methods.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/suppl_1/i16}
}
@article{Beerenwinkel2001Geno2pheno,
  author   = {Beerenwinkel, N. and Schmidt, B. and Walter, H. and Kaiser, R. and Lengauer, T. and Hoffmann, D. and Korn, K. and Selbig, J.},
  title    = {{G}eno2pheno: {I}nterpreting {G}enotypic {HIV} {D}rug {R}esistance {T}ests},
  journal  = {I{EEE} {I}ntelligent {S}ystems},
  year     = {2001},
  volume   = {16},
  number   = {6},
  pages    = {35--41},
  abstract = {Rapid accumulation of resistance mutations in the genome of the human immunodeficiency virus ({HIV}) plays a central role in drug treatment failure in infected patients. {T}he authors have developed geno2pheno, an intelligent system that uses the information encoded in the viral genomic sequence to predict resistance or susceptibility of the virus to 13 antiretroviral agents. {T}o predict phenotypic drug resistance from genotype, they applied two machine learning techniques: decision trees and linear support vector machines. {T}hese techniques performed learning on more than 400 genotype-phenotype pairs for each drug. {T}he authors compared the generalization performance of the two families of models in leave-one-out experiments. {E}xcept for three drugs, all error estimates ranged between 7.25 and 15.5 percent. {S}upport vector machines performed slightly better for most drugs, but knowledge extraction was easier for decision trees. {G}eno2pheno is freely available at http://cartan.gmd.de/geno2pheno.html.},
  doi      = {10.1109/5254.972080},
  pdf      = {../local/Beerenwinkel2001Geno2pheno.pdf},
  file     = {Beerenwinkel2001Geno2pheno.pdf:local/Beerenwinkel2001Geno2pheno.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://dx.doi.org/10.1109/5254.972080}
}
@article{Ben-Dor2000Tissue,
  author   = {Ben-Dor, A. and Bruhn, L. and Friedman, N. and Nachman, I. and Schummer, M. and Yakhini, Z.},
  title    = {Tissue Classification with Gene Expression Profiles},
  journal  = {J. Comput. Biol.},
  year     = {2000},
  volume   = {7},
  number   = {3-4},
  pages    = {559--583},
  abstract = {Constantly improving gene expression profiling technologies are expected to provide understanding and insight into cancer-related cellular processes. {G}ene expression data is also expected to significantly aid in the development of efficient cancer diagnosis and classification platforms. {I}n this work we examine three sets of gene expression data measured across sets of tumor(s) and normal clinical samples: {T}he first set consists of 2,000 genes, measured in 62 epithelial colon samples ({A}lon et al., 1999). {T}he second consists of approximately equal to 100,000 clones, measured in 32 ovarian samples (unpublished extension of data set described in {S}chummer et al. (1999)). {T}he third set consists of approximately equal to 7,100 genes, measured in 72 bone marrow and peripheral blood samples ({G}olub et al, 1999). {W}e examine the use of scoring methods, measuring separation of tissue type (e.g., tumors from normals) using individual gene expression levels. {T}hese are then coupled with high-dimensional classification methods to assess the classification power of complete expression profiles. {W}e present results of performing leave-one-out cross validation ({LOOCV}) experiments on the three data sets, employing nearest neighbor classifier, {SVM} ({C}ortes and {V}apnik, 1995), {A}da{B}oost ({F}reund and {S}chapire, 1997) and a novel clustering-based classification technique. {A}s tumor samples can differ from normal samples in their cell-type composition, we also perform {LOOCV} experiments using appropriately modified sets of genes, attempting to eliminate the resulting bias. {W}e demonstrate success rate of at least 90\% in tumor versus normal classification, using sets of selected genes, with, as well as without, cellular-contamination-related members. {T}hese results are insensitive to the exact selection mechanism, over a certain range.},
  pdf      = {../local/Ben-Dor2000Tissue.pdf},
  file     = {Ben-Dor2000Tissue.pdf:local/Ben-Dor2000Tissue.pdf:PDF},
  keywords = {biosvm microarray},
  owner    = {jeanphilippevert},
  url      = {http://www.liebertonline.com/doi/abs/10.1089/106652700750050943}
}
@article{Ben-Hur2003Remote,
  author   = {Ben-Hur, A. and Brutlag, D.},
  title    = {Remote homology detection: a motif based approach},
  journal  = {Bioinformatics},
  year     = {2003},
  volume   = {19},
  number   = {Suppl. 1},
  pages    = {i26--i33},
  abstract = {Motivation: {R}emote homology detection is the problem of detecting homology in cases of low sequence similarity. {I}t is a hard computational problem with no approach that works well in all cases. {R}esults: {W}e present a method for detecting remote homology that is based on the presence of discrete sequence motifs. {T}he motif content of a pair of sequences is used to define a similarity that is used as a kernel for a {S}upport {V}ector {M}achine ({SVM}) classifier. {W}e test the method on two remote homology detection tasks: prediction of a previously unseen {SCOP} family and prediction of an enzyme class given other enzymes that have a similar function on other substrates. {W}e find that it performs significantly better than an {SVM} method that uses {BLAST} or {S}mith-{W}aterman similarity scores as features. {A}vailability: {T}he software is available from the authors upon request.},
  pdf      = {../local/Ben-Hur2003Remote.pdf},
  file     = {Ben-Hur2003Remote.pdf:local/Ben-Hur2003Remote.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/suppl_1/i26}
}
@article{Ben-Hur2005Kernel,
  author   = {Ben-Hur, A. and Noble, W. S.},
  title    = {Kernel methods for predicting protein-protein interactions.},
  journal  = {Bioinformatics},
  year     = {2005},
  volume   = {21},
  pages    = {i38--i46},
  number   = {Suppl. 1},
  month    = jun,
  abstract = {M{OTIVATION}: {D}espite advances in high-throughput methods for discovering protein-protein interactions, the interaction networks of even well-studied model organisms are sketchy at best, highlighting the continued need for computational methods to help direct experimentalists in the search for novel interactions. {RESULTS}: {W}e present a kernel method for predicting protein-protein interactions using a combination of data sources, including protein sequences, {G}ene {O}ntology annotations, local properties of the network, and homologous interactions in other species. {W}hereas protein kernels proposed in the literature provide a similarity between single proteins, prediction of interactions requires a kernel between pairs of proteins. {W}e propose a pairwise kernel that converts a kernel between single proteins into a kernel between pairs of proteins, and we illustrate the kernel's effectiveness in conjunction with a support vector machine classifier. {F}urthermore, we obtain improved performance by combining several sequence-based kernels based on k-mer frequency, motif and domain content and by further augmenting the pairwise sequence kernel with features that are based on other sources of data. {W}e apply our method to predict physical interactions in yeast using data from the {BIND} database. {A}t a false positive rate of 1\% the classifier retrieves close to 80\% of a set of trusted interactions. {W}e thus demonstrate the ability of our method to make accurate predictions despite the sizeable fraction of false positives that are known to exist in interaction databases. {AVAILABILITY}: {T}he classification experiments were performed using {P}y{ML} available at http://pyml.sourceforge.net. {D}ata are available at: http://noble.gs.washington.edu/proj/sppi {CONTACT}: asa@gs.washington.edu.},
  doi      = {10.1093/bioinformatics/bti1016},
  pdf      = {../local/Ben-Hur2005Kernel.pdf},
  file     = {Ben-Hur2005Kernel.pdf:local/Ben-Hur2005Kernel.pdf:PDF},
  keywords = {biosvm},
  pii      = {21/suppl_1/i38},
  url      = {http://dx.doi.org/10.1093/bioinformatics/bti1016}
}
@article{Bern2004Automatic,
  author   = {Bern, M. and Goldberg, D. and McDonald, W. H. and Yates, III, J. R.},
  title    = {Automatic {Q}uality {A}ssessment of {P}eptide {T}andem {M}ass {S}pectra},
  journal  = {Bioinformatics},
  year     = {2004},
  volume   = {20},
  pages    = {i49--i54},
  number   = {Suppl. 1},
  abstract = {Motivation: {A} powerful proteomics methodology couples high-performance liquid chromatography ({HPLC}) with tandem mass spectrometry and database-search software, such as {SEQUEST}. {S}uch a set-up, however, produces a large number of spectra, many of which are of too poor quality to be useful. {H}ence a filter that eliminates poor spectra before the database search can significantly improve throughput and robustness. {M}oreover, spectra judged to be of high quality, but that cannot be identified by database search, are prime candidates for still more computationally intensive methods, such as de novo sequencing or wider database searches including post-translational modifications. {R}esults: {W}e report on two different approaches to assessing spectral quality prior to identification: binary classification, which predicts whether or not {SEQUEST} will be able to make an identification, and statistical regression, which predicts a more universal quality metric involving the number of b- and y-ion peaks. {T}he best of our binary classifiers can eliminate over 75\% of the unidentifiable spectra while losing only 10\% of the identifiable spectra. {S}tatistical regression can pick out spectra of modified peptides that can be identified by a de novo program but not by {SEQUEST}. {I}n a section of independent interest, we discuss intensity normalization of mass spectra.},
  pdf      = {../local/Bern2004Automatic.pdf},
  file     = {Bern2004Automatic.pdf:local/Bern2004Automatic.pdf:PDF},
  keywords = {biosvm proteomics},
  owner    = {jeanphilippevert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/20/suppl_1/i49}
}
@article{Bhasin2005GPCRsclass,
  author   = {Bhasin, M. and Raghava, G. P. S.},
  title    = {G{PCR}sclass: a web tool for the classification of amine type of {G}-protein-coupled receptors.},
  journal  = {Nucleic {A}cids {R}es.},
  year     = {2005},
  volume   = {33},
  pages    = {W143--7},
  number   = {Web Server issue},
  month    = jul,
  abstract = {The receptors of amine subfamily are specifically major drug targets for therapy of nervous disorders and psychiatric diseases. {T}he recognition of novel amine type of receptors and their cognate ligands is of paramount interest for pharmaceutical companies. {I}n the past, {C}hou and co-workers have shown that different types of amine receptors are correlated with their amino acid composition and are predictable on its basis with considerable accuracy [{E}lrod and {C}hou (2002) {P}rotein {E}ng., 15, 713-715]. {T}his motivated us to develop a better method for the recognition of novel amine receptors and for their further classification. {T}he method was developed on the basis of amino acid composition and dipeptide composition of proteins using support vector machine. {T}he method was trained and tested on 167 proteins of amine subfamily of {G}-protein-coupled receptors ({GPCR}s). {T}he method discriminated amine subfamily of {GPCR}s from globular proteins with {M}atthew's correlation coefficient of 0.98 and 0.99 using amino acid composition and dipeptide composition, respectively. {I}n classifying different types of amine receptors using amino acid composition and dipeptide composition, the method achieved an accuracy of 89.8 and 96.4\%, respectively. {T}he performance of the method was evaluated using 5-fold cross-validation. {T}he dipeptide composition based method predicted 67.6\% of protein sequences with an accuracy of 100\% with a reliability index > or =5. {A} web server {GPCR}sclass has been developed for predicting amine-binding receptors from its amino acid sequence [http://www.imtech.res.in/raghava/gpcrsclass/ and http://bioinformatics.uams.edu/raghava/gpersclass/ (mirror site)].},
  doi      = {10.1093/nar/gki351},
  pdf      = {../local/Bhasin2005GPCRsclass.pdf},
  file     = {Bhasin2005GPCRsclass.pdf:local/Bhasin2005GPCRsclass.pdf:PDF},
  keywords = {biosvm},
  pii      = {33/suppl_2/W143},
  url      = {http://dx.doi.org/10.1093/nar/gki351}
}
@article{Bhasin2005Pcleavage,
  author   = {Bhasin, M. and Raghava, G. P. S.},
  title    = {Pcleavage: an {SVM} based method for prediction of constitutive proteasome and immunoproteasome cleavage sites in antigenic sequences.},
  journal  = {Nucleic {A}cids {R}es.},
  year     = {2005},
  volume   = {33},
  pages    = {W202--7},
  number   = {Web Server issue},
  month    = jul,
  abstract = {This manuscript describes a support vector machine based method for the prediction of constitutive as well as immunoproteasome cleavage sites in antigenic sequences. {T}his method achieved {M}atthew's correlation coefficients of 0.54 and 0.43 on in vitro and major histocompatibility complex ligand data, respectively. {T}his shows that the performance of our method is comparable to that of the {N}et{C}hop method, which is currently considered to be the best method for proteasome cleavage site prediction. {B}ased on the method, a web server, {P}cleavage, has also been developed. {T}his server accepts protein sequences in any standard format and present results in a user-friendly format. {T}he server is available for free use by all academic users at the {URL} http://www.imtech.res.in/raghava/pcleavage/ or http://bioinformatics.uams.edu/mirror/pcleavage/.},
  doi      = {10.1093/nar/gki587},
  pdf      = {../local/Bhasin2005Pcleavage.pdf},
  file     = {Bhasin2005Pcleavage.pdf:local/Bhasin2005Pcleavage.pdf:PDF},
  keywords = {biosvm immunoinformatics},
  url      = {http://dx.doi.org/10.1093/nar/gki587}
}
@article{Bhasin2004Analysis,
  author   = {Bhasin, M. and Raghava, G. P. S.},
  title    = {Analysis and prediction of affinity of {TAP} binding peptides using cascade {SVM}},
  journal  = {Protein {S}ci.},
  year     = {2004},
  volume   = {13},
  pages    = {596--607},
  number   = {3},
  month    = mar,
  abstract = {The generation of cytotoxic {T} lymphocyte ({CTL}) epitopes from an antigenic sequence involves number of intracellular processes, including production of peptide fragments by proteasome and transport of peptides to endoplasmic reticulum through transporter associated with antigen processing ({TAP}). {I}n this study, 409 peptides that bind to human {TAP} transporter with varying affinity were analyzed to explore the selectivity and specificity of {TAP} transporter. {T}he abundance of each amino acid from {P}1 to {P}9 positions in high-, intermediate-, and low-affinity {TAP} binders were examined. {T}he rules for predicting {TAP} binding regions in an antigenic sequence were derived from the above analysis. {T}he quantitative matrix was generated on the basis of contribution of each position and residue in binding affinity. {T}he correlation of r = 0.65 was obtained between experimentally determined and predicted binding affinity by using a quantitative matrix. {F}urther a support vector machine ({SVM})-based method has been developed to model the {TAP} binding affinity of peptides. {T}he correlation (r = 0.80) was obtained between the predicted and experimental measured values by using sequence-based {SVM}. {T}he reliability of prediction was further improved by cascade {SVM} that uses features of amino acids along with sequence. {A}n extremely good correlation (r = 0.88) was obtained between measured and predicted values, when the cascade {SVM}-based method was evaluated through jackknife testing. {A} {W}eb service, {TAPP}red (http://www.imtech.res.in/raghava/tappred/ or http://bioinformatics.uams.edu/mirror/tappred/), has been developed based on this approach.},
  doi      = {10.1110/ps.03373104},
  pdf      = {../local/Bhasin2004Analysis.pdf},
  file     = {Bhasin2004Analysis.pdf:local/Bhasin2004Analysis.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://dx.doi.org/10.1110/ps.03373104}
}
@article{Bhasin2004Classification,
  author   = {Bhasin, M. and Raghava, G. P. S.},
  title    = {Classification of {N}uclear {R}eceptors {B}ased on {A}mino {A}cid {C}omposition and {D}ipeptide {C}omposition},
  journal  = {J. {B}iol. {C}hem.},
  year     = {2004},
  volume   = {279},
  pages    = {23262--23266},
  number   = {22},
  abstract = {Nuclear receptors are key transcription factors that regulate crucial gene networks responsible for cell growth, differentiation, and homeostasis. {N}uclear receptors form a superfamily of phylogenetically related proteins and control functions associated with major diseases (e.g. diabetes, osteoporosis, and cancer). {I}n this study, a novel method has been developed for classifying the subfamilies of nuclear receptors. {T}he classification was achieved on the basis of amino acid and dipeptide composition from a sequence of receptors using support vector machines. {T}he training and testing was done on a non-redundant data set of 282 proteins obtained from the {N}uclea{RDB} data base (1). {T}he performance of all classifiers was evaluated using a 5-fold cross validation test. {I}n the 5-fold cross-validation, the data set was randomly partitioned into five equal sets and evaluated five times on each distinct set while keeping the remaining four sets for training. {I}t was found that different subfamilies of nuclear receptors were quite closely correlated in terms of amino acid composition as well as dipeptide composition. {T}he overall accuracy of amino acid composition-based and dipeptide composition-based classifiers were 82.6 and 97.5\%, respectively. {T}herefore, our results prove that different subfamilies of nuclear receptors are predictable with considerable accuracy using amino acid or dipeptide composition. {F}urthermore, based on above approach, an online web service, {NR}pred, was developed, which is available at www.imtech.res.in/raghava/nrpred.},
  doi      = {10.1074/jbc.M401932200},
  eprint   = {http://www.jbc.org/cgi/reprint/279/22/23262.pdf},
  pdf      = {../local/Bhasin2004Classification.pdf},
  file     = {Bhasin2004Classification.pdf:local/Bhasin2004Classification.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://dx.doi.org/10.1074/jbc.M401932200}
}
@article{Bhasin2004ESLpred,
  author   = {Bhasin, M. and Raghava, G. P. S.},
  title    = {{{ESL}pred: {SVM}}-based method for subcellular localization of eukaryotic proteins using dipeptide composition and {{PSI}-{BLAST}}},
  journal  = {Nucleic {A}cids {R}es.},
  year     = {2004},
  volume   = {32},
  pages    = {W414--419},
  number   = {Suppl. 2},
  abstract = {Automated prediction of subcellular localization of proteins is an important step in the functional annotation of genomes. {T}he existing subcellular localization prediction methods are based on either amino acid composition or {N}-terminal characteristics of the proteins. {I}n this paper, support vector machine ({SVM}) has been used to predict the subcellular location of eukaryotic proteins from their different features such as amino acid composition, dipeptide composition and physico-chemical properties. {T}he {SVM} module based on dipeptide composition performed better than the {SVM} modules based on amino acid composition or physico-chemical properties. {I}n addition, {PSI}-{BLAST} was also used to search the query sequence against the dataset of proteins (experimentally annotated proteins) to predict its subcellular location. {I}n order to improve the prediction accuracy, we developed a hybrid module using all features of a protein, which consisted of an input vector of 458 dimensions (400 dipeptide compositions, 33 properties, 20 amino acid compositions of the protein and 5 from {PSI}-{BLAST} output). {U}sing this hybrid approach, the prediction accuracies of nuclear, cytoplasmic, mitochondrial and extracellular proteins reached 95.3, 85.2, 68.2 and 88.9\%, respectively. {T}he overall prediction accuracy of {SVM} modules based on amino acid composition, physico-chemical properties, dipeptide composition and the hybrid approach was 78.1, 77.8, 82.9 and 88.0\%, respectively. {T}he accuracy of all the modules was evaluated using a 5-fold cross-validation technique. {A}ssigning a reliability index (reliability index > or =3), 73.5\% of prediction can be made with an accuracy of 96.4\%. {B}ased on the above approach, an online web server {ESL}pred was developed, which is available at http://www.imtech.res.in/raghava/eslpred/.},
  doi      = {10.1093/nar/gkh350},
  eprint   = {http://nar.oupjournals.org/cgi/reprint/32/suppl_2/W414.pdf},
  pdf      = {../local/Bhasin2004ESLpred.pdf},
  file     = {Bhasin2004ESLpred.pdf:local/Bhasin2004ESLpred.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://nar.oupjournals.org/cgi/content/abstract/32/suppl_2/W414}
}
@article{Bhasin2004GPCRpred,
  author   = {Bhasin, M. and Raghava, G. P. S.},
  title    = {{{GPCR}pred}: an {SVM}-based method for prediction of families and subfamilies of {G}-protein coupled receptors},
  journal  = {Nucleic {A}cids {R}es.},
  year     = {2004},
  volume   = {32},
  pages    = {W383--389},
  number   = {Suppl. 2},
  abstract = {G-protein coupled receptors ({GPCR}s) belong to one of the largest superfamilies of membrane proteins and are important targets for drug design. {I}n this study, a support vector machine ({SVM})-based method, {GPCR}pred, has been developed for predicting families and subfamilies of {GPCR}s from the dipeptide composition of proteins. {T}he dataset used in this study for training and testing was obtained from http://www.soe.ucsc.edu/research/compbio/gpcr/. {T}he method classified {GPCR}s and non-{GPCR}s with an accuracy of 99.5\% when evaluated using 5-fold cross-validation. {T}he method is further able to predict five major classes or families of {GPCR}s with an overall {M}atthew's correlation coefficient ({MCC}) and accuracy of 0.81 and 97.5\% respectively. {I}n recognizing the subfamilies of the rhodopsin-like family, the method achieved an average {MCC} and accuracy of 0.97 and 97.3\% respectively. {T}he method achieved overall accuracy of 91.3\% and 96.4\% at family and subfamily level respectively when evaluated on an independent/blind dataset of 650 {GPCR}s. {A} server for recognition and classification of {GPCR}s based on multiclass {SVM}s has been set up at http://www.imtech.res.in/raghava/gpcrpred/. {W}e have also suggested subfamilies for 42 sequences which were previously identified as unclassified {C}lass{A} {GPCR}s. {T}he supplementary information is available at http://www.imtech.res.in/raghava/gpcrpred/info.html.},
  doi      = {10.1093/nar/gkh416},
  eprint   = {http://nar.oupjournals.org/cgi/reprint/32/suppl_2/W383.pdf},
  pdf      = {../local/Bhasin2004GPCRpred.pdf},
  file     = {Bhasin2004GPCRpred.pdf:local/Bhasin2004GPCRpred.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://dx.doi.org/10.1093/nar/gkh416}
}
@article{Bhasin2004Prediction,
  author   = {Bhasin, M. and Raghava, G. P. S.},
  title    = {Prediction of {CTL} epitopes using {QM}, {SVM} and {ANN} techniques},
  journal  = {Vaccine},
  year     = {2004},
  volume   = {22},
  pages    = {3195--3204},
  number   = {23-24},
  abstract = {Cytotoxic {T} lymphocyte ({CTL}) epitopes are potential candidates for subunit vaccine design for various diseases. {M}ost of the existing {T} cell epitope prediction methods are indirect methods that predict {MHC} class {I} binders instead of {CTL} epitopes. {I}n this study, a systematic attempt has been made to develop a direct method for predicting {CTL} epitopes from an antigenic sequence. {T}his method is based on quantitative matrix ({QM}) and machine learning techniques such as {S}upport {V}ector {M}achine ({SVM}) and {A}rtificial {N}eural {N}etwork ({ANN}). {T}his method has been trained and tested on non-redundant dataset of {T} cell epitopes and non-epitopes that includes 1137 experimentally proven {MHC} class {I} restricted {T} cell epitopes. {T}he accuracy of {QM}-, {ANN}- and {SVM}-based methods was 70.0, 72.2 and 75.2\%, respectively. {T}he performance of these methods has been evaluated through {L}eave {O}ne {O}ut {C}ross-{V}alidation ({LOOCV}) at a cutoff score where sensitivity and specificity was nearly equal. {F}inally, both machine-learning methods were used for consensus and combined prediction of {CTL} epitopes. {T}he performances of these methods were evaluated on blind dataset where machine learning-based methods perform better than {QM}-based method. {W}e also demonstrated through subgroup analysis that our methods can discriminate between {T}-cell epitopes and {MHC} binders (non-epitopes). {I}n brief this method allows prediction of {CTL} epitopes using {QM}, {SVM}, {ANN} approaches. {T}he method also facilitates prediction of {MHC} restriction in predicted {T} cell epitopes.},
  doi      = {10.1016/j.vaccine.2004.02.005},
  pdf      = {../local/Bhasin2004Prediction.pdf},
  file     = {Bhasin2004Prediction.pdf:local/Bhasin2004Prediction.pdf:PDF},
  keywords = {biosvm immunoinformatics},
  owner    = {jeanphilippevert},
  url      = {http://dx.doi.org/10.1016/j.vaccine.2004.02.005}
}
@article{Bhasin2004SVM,
  author   = {Bhasin, M. and Raghava, G. P. S.},
  title    = {S{VM} based method for predicting {{HLA}-{DRB}1*0401} binding peptides in an antigen sequence},
  journal  = {Bioinformatics},
  year     = {2004},
  volume   = {20},
  pages    = {421--423},
  number   = {3},
  abstract = {Summary: {P}rediction of peptides binding with {MHC} class {II} allele {HLA}-{DRB}1*0401 can effectively reduce the number of experiments required for identifying helper {T} cell epitopes. {T}his paper describes support vector machine ({SVM}) based method developed for identifying {HLA}-{DRB}1*0401 binding peptides in an antigenic sequence. {SVM} was trained and tested on large and clean data set consisting of 567 binders and equal number of non-binders. {T}he accuracy of the method was 86\% when evaluated through 5-fold cross-validation technique. {A}vailable: {A} web server {HLA}-{DR}4{P}red based on above approach is available at http://www.imtech.res.in/raghava/hladr4pred/ and http://bioinformatics.uams.edu/mirror/hladr4pred/ ({M}irror {S}ite). {S}upplementary information: http://www.imtech.res.in/raghava/hladr4pred/info.html},
  pdf      = {../local/Bhasin2004SVM.pdf},
  file     = {Bhasin2004SVM.pdf:local/Bhasin2004SVM.pdf:PDF},
  keywords = {biosvm immunoinformatics},
  owner    = {jeanphilippevert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/20/3/421}
}
@article{Bock2003Whole-proteome,
  author   = {Bock, J. R. and Gough, D. A.},
  title    = {Whole-proteome interaction mining},
  journal  = {Bioinformatics},
  year     = {2003},
  volume   = {19},
  pages    = {125--134},
  number   = {1},
  abstract = {Motivation: {A} major post-genomic scientific and technological pursuit is to describe the functions performed by the proteins encoded by the genome. {O}ne strategy is to first identify the protein-protein interactions in a proteome, then determine pathways and overall structure relating these interactions, and finally to statistically infer functional roles of individual proteins. {A}lthough huge amounts of genomic data are at hand, current experimental protein interaction assays must overcome technical problems to scale-up for high-throughput analysis. {I}n the meantime, bioinformatics approaches may help bridge the information gap required for inference of protein function. {I}n this paper, a previously described data mining approach to prediction of protein-protein interactions ({B}ock and {G}ough, 2001, {B}ioinformatics, 17, 455-460) is extended to interaction mining on a proteome-wide scale. {A}n algorithm (the phylogenetic bootstrap) is introduced, which suggests traversal of a phenogram, interleaving rounds of computation and experiment, to develop a knowledge base of protein interactions in genetically-similar organisms. {R}esults: {T}he interaction mining approach was demonstrated by building a learning system based on 1,039 experimentally validated protein-protein interactions in the human gastric bacterium {H}elicobacter pylori. {A}n estimate of the generalization performance of the classifier was derived from 10-fold cross-validation, which indicated expected upper bounds on precision of 80\% and sensitivity of 69\% when applied to related organisms. {O}ne such organism is the enteric pathogen {C}ampylobacter jejuni, in which comprehensive machine learning prediction of all possible pairwise protein-protein interactions was performed. {T}he resulting network of interactions shares an average protein connectivity characteristic in common with previous investigations reported in the literature, offering strong evidence supporting the biological feasibility of the hypothesized map. {F}or inferences about complete proteomes in which the number of pairwise non-interactions is expected to be much larger than the number of actual interactions, we anticipate that the sensitivity will remain the same but precision may decrease. {W}e present specific biological examples of two subnetworks of protein-protein interactions in {C}. jejuni resulting from the application of this approach, including elements of a two-component signal transduction systems for thermoregulation, and a ferritin uptake network. {C}ontact: dgough@bioeng.ucsd.edu},
  pdf      = {../local/Bock2003Whole-proteome.pdf},
  file     = {Bock2003Whole-proteome.pdf:local/Bock2003Whole-proteome.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/1/125}
}
@article{Bock2002New,
  author   = {Bock, J. R. and Gough, D. A.},
  title    = {A {N}ew {M}ethod to {E}stimate {L}igand-{R}eceptor {E}nergetics},
  journal  = {Mol {C}ell {P}roteomics},
  year     = {2002},
  volume   = {1},
  pages    = {904--910},
  number   = {11},
  abstract = {In the discovery of new drugs, lead identification and optimization have assumed critical importance given the number of drug targets generated from genetic, genomics, and proteomic technologies. {H}igh-throughput experimental screening assays have been complemented recently by ``virtual screening'' approaches to identify and filter potential ligands when the characteristics of a target receptor structure of interest are known. {V}irtual screening mandates a reliable procedure for automatic ranking of structurally distinct ligands in compound library databases. {C}omputing a rank score requires the accurate prediction of binding affinities between these ligands and the target. {M}any current scoring strategies require information about the target three-dimensional structure. {I}n this study, a new method to estimate the free binding energy between a ligand and receptor is proposed. {W}e extend a central idea previously reported ({B}ock, {J}. {R}., and {G}ough, {D}. {A}. (2001) {P}redicting protein-protein interactions from primary structure. {B}ioinformatics 17, 455-460; {B}ock, {J}. {R}., and {G}ough, {D}. {A}. (2002) {W}hole-proteome interaction mining. {B}ioinformatics, in press) that uses simple descriptors to represent biomolecules as input examples to train a support vector machine ({S}mola, {A}. {J}., and {S}cholkopf, {B}. (1998) {A} {T}utorial on {S}upport {V}ector {R}egression, {N}euro{COLT} {T}echnical {R}eport {NC}-{TR}-98-030, {R}oyal {H}olloway {C}ollege, {U}niversity of {L}ondon, {UK}) and the application of the trained system to previously unseen pairs, estimating their propensity for interaction. {H}ere we seek to learn the function that maps features of a receptor-ligand pair onto their equilibrium free binding energy. {T}hese features do not comprise any direct information about the three-dimensional structures of ligand or target. {I}n cross-validation experiments, it is demonstrated that objective measurements of prediction error rate and rank-ordering statistics are competitive with those of several other investigations, most of which depend on three-dimensional structural data. {T}he size of the sample (n = 2,671) indicates that this approach is robust and may have widespread applicability beyond restricted families of receptor types. {I}t is concluded that newly sequenced proteins, or those for which three-dimensional crystal structures are not easily obtained, can be rapidly analyzed for their binding potential against a library of ligands using this methodology.},
  pdf      = {../local/Bock2002New.pdf},
  file     = {Bock2002New.pdf:local/Bock2002New.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://www.mcponline.org/cgi/content/abstract/1/11/904}
}
@article{Bock2001Predicting,
  author   = {Bock, J. R. and Gough, D. A.},
  title    = {Predicting protein-protein interactions from primary structure},
  journal  = {Bioinformatics},
  volume   = 17,
  number   = 5,
  pages    = {455--460},
  year     = 2001,
  keywords = {biosvm},
  subject  = {biokernel},
  url      = {http://bioinformatics.oupjournals.org/cgi/reprint/17/5/455.pdf},
  pdf      = {../local/bock01.pdf},
  file     = {bock01.pdf:local/bock01.pdf:PDF}
}
@article{Bordner2005Statistical,
  author   = {Bordner, Andrew J. and Abagyan, Ruben},
  title    = {Statistical analysis and prediction of protein-protein interfaces.},
  journal  = {Proteins},
  year     = {2005},
  volume   = {60},
  pages    = {353--66},
  number   = {3},
  month    = aug,
  abstract = {Predicting protein-protein interfaces from a three-dimensional structure is a key task of computational structural proteomics. {I}n contrast to geometrically distinct small molecule binding sites, protein-protein interface are notoriously difficult to predict. {W}e generated a large nonredundant data set of 1494 true protein-protein interfaces using biological symmetry annotation where necessary. {T}he data set was carefully analyzed and a {S}upport {V}ector {M}achine was trained on a combination of a new robust evolutionary conservation signal with the local surface properties to predict protein-protein interfaces. {F}ivefold cross validation verifies the high sensitivity and selectivity of the model. {A}s much as 97\% of the predicted patches had an overlap with the true interface patch while only 22\% of the surface residues were included in an average predicted patch. {T}he model allowed the identification of potential new interfaces and the correction of mislabeled oligomeric states.},
  doi      = {10.1002/prot.20433},
  pdf      = {../local/Bordner2005Statistical.pdf},
  file     = {Bordner2005Statistical.pdf:local/Bordner2005Statistical.pdf:PDF},
  keywords = {biosvm},
  url      = {http://dx.doi.org/10.1002/prot.20433}
}
@article{Borgwardt2005Protein,
  author   = {Borgwardt, K. M. and Ong, C. S. and Sch{\"o}nauer, S. and Vishwanathan, S. V. N. and Smola, A. J. and Kriegel, H.-P.},
  title    = {Protein function prediction via graph kernels.},
  journal  = {Bioinformatics},
  year     = {2005},
  volume   = {21},
  pages    = {i47--i56},
  number   = {Suppl. 1},
  month    = jun,
  abstract = {M{OTIVATION}: {C}omputational approaches to protein function prediction infer protein function by finding proteins with similar sequence, structure, surface clefts, chemical properties, amino acid motifs, interaction partners or phylogenetic profiles. {W}e present a new approach that combines sequential, structural and chemical information into one graph model of proteins. {W}e predict functional class membership of enzymes and non-enzymes using graph kernels and support vector machine classification on these protein graphs. {RESULTS}: {O}ur graph model, derivable from protein sequence and structure only, is competitive with vector models that require additional protein information, such as the size of surface pockets. {I}f we include this extra information into our graph model, our classifier yields significantly higher accuracy levels than the vector models. {H}yperkernels allow us to select and to optimally combine the most relevant node attributes in our protein graphs. {W}e have laid the foundation for a protein function prediction system that integrates protein information from various sources efficiently and effectively. {AVAILABILITY}: {M}ore information available via www.dbs.ifi.lmu.de/{M}itarbeiter/borgwardt.html. {CONTACT}: borgwardt@dbs.ifi.lmu.de.},
  doi      = {10.1093/bioinformatics/bti1007},
  pdf      = {../local/Borgwardt2005Protein.pdf},
  file     = {Borgwardt2005Protein.pdf:local/Borgwardt2005Protein.pdf:PDF},
  keywords = {biosvm},
  pii      = {21/suppl_1/i47},
  url      = {http://dx.doi.org/10.1093/bioinformatics/bti1007}
}
@article{Bradford2005Improved,
  author = {Bradford, James R. and Westhead, David R.},
  title = {Improved prediction of protein-protein binding sites using a support vector machines approach.},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21},
  number = {8},
  pages = {1487--1494},
  month = apr,
  abstract = {M{OTIVATION}: {S}tructural genomics projects are beginning to produce protein structures with unknown function, therefore, accurate, automated predictors of protein function are required if all these structures are to be properly annotated in reasonable time. {I}dentifying the interface between two interacting proteins provides important clues to the function of a protein and can reduce the search space required by docking algorithms to predict the structures of complexes. {RESULTS}: {W}e have combined a support vector machine ({SVM}) approach with surface patch analysis to predict protein-protein binding sites. {U}sing a leave-one-out cross-validation procedure, we were able to successfully predict the location of the binding site on 76\% of our dataset made up of proteins with both transient and obligate interfaces. {W}ith heterogeneous cross-validation, where we trained the {SVM} on transient complexes to predict on obligate complexes (and vice versa), we still achieved comparable success rates to the leave-one-out cross-validation suggesting that sufficient properties are shared between transient and obligate interfaces. {AVAILABILITY}: {A} web application based on the method can be found at http://www.bioinformatics.leeds.ac.uk/ppi_pred. {T}he dataset of 180 proteins used in this study is also available via the same web site. {CONTACT}: westhead@bmb.leeds.ac.uk {SUPPLEMENTARY} {INFORMATION}: http://www.bioinformatics.leeds.ac.uk/ppi-pred/supp-material.},
  doi = {10.1093/bioinformatics/bti242},
  pdf = {../local/Bradford2005Improved.pdf},
  file = {Bradford2005Improved.pdf:local/Bradford2005Improved.pdf:PDF},
  keywords = {biosvm},
  pii = {bti242},
  url = {http://dx.doi.org/10.1093/bioinformatics/bti242}
}
@article{Briem2005Classifying,
  author = {Briem, Hans and G{\"u}nther, Judith},
  title = {Classifying ``kinase inhibitor-likeness'' by using machine-learning methods.},
  journal = {ChemBioChem},
  year = {2005},
  volume = {6},
  number = {3},
  pages = {558--566},
  month = mar,
  abstract = {By using an in-house data set of small-molecule structures, encoded by {G}hose-{C}rippen parameters, several machine learning techniques were applied to distinguish between kinase inhibitors and other molecules with no reported activity on any protein kinase. {A}ll four approaches pursued--support-vector machines ({SVM}), artificial neural networks ({ANN}), k nearest neighbor classification with {GA}-optimized feature selection ({GA}/k{NN}), and recursive partitioning ({RP})--proved capable of providing a reasonable discrimination. {N}evertheless, substantial differences in performance among the methods were observed. {F}or all techniques tested, the use of a consensus vote of the 13 different models derived improved the quality of the predictions in terms of accuracy, precision, recall, and {F}1 value. {S}upport-vector machines, followed by the {GA}/k{NN} combination, outperformed the other techniques when comparing the average of individual models. {B}y using the respective majority votes, the prediction of neural networks yielded the highest {F}1 value, followed by {SVM}s.},
  doi = {10.1002/cbic.200400109},
  pdf = {../local/Briem2005Classifying.pdf},
  file = {Briem2005Classifying.pdf:local/Briem2005Classifying.pdf:PDF},
  keywords = {biosvm chemoinformatics},
  url = {http://dx.doi.org/10.1002/cbic.200400109}
}
@article{Brown2000Knowledge-based,
  author = {Brown, M. P. and Grundy, W. N. and Lin, D. and Cristianini, N. and Sugnet, C. W. and Furey, T. S. and Ares, M. and Haussler, D.},
  title = {Knowledge-based analysis of microarray gene expression data by using support vector machines.},
  journal = {Proc. {N}atl. {A}cad. {S}ci. {USA}},
  year = {2000},
  volume = {97},
  number = {1},
  pages = {262--267},
  month = jan,
  abstract = {We introduce a method of functionally classifying genes by using gene expression data from {DNA} microarray hybridization experiments. {T}he method is based on the theory of support vector machines ({SVM}s). {SVM}s are considered a supervised computer learning method because they exploit prior knowledge of gene function to identify unknown genes of similar function from expression data. {SVM}s avoid several problems associated with unsupervised clustering methods, such as hierarchical clustering and self-organizing maps. {SVM}s have many mathematical features that make them attractive for gene expression analysis, including their flexibility in choosing a similarity function, sparseness of solution when dealing with large data sets, the ability to handle large feature spaces, and the ability to identify outliers. {W}e test several {SVM}s that use different similarity metrics, as well as some other supervised learning methods, and find that the {SVM}s best identify sets of genes with a common function using expression data. {F}inally, we use {SVM}s to predict functional roles for uncharacterized yeast {ORF}s based on their expression data.},
  pdf = {../local/Brown2000Knowledge-based.pdf},
  file = {Brown2000Knowledge-based.pdf:local/Brown2000Knowledge-based.pdf:PDF},
  keywords = {biosvm microarray},
  url = {http://www.pnas.org/cgi/content/abstract/97/1/262}
}
@article{Bunescu2005Comparative,
  author = {Bunescu, R. and Ge, R. and Kate, R. J. and Marcotte, E. M. and Mooney, R. J. and Ramani, A. K. and Wong, Y. W.},
  title = {Comparative experiments on learning information extractors for proteins and their interactions.},
  journal = {Artif. {I}ntell. {M}ed.},
  year = {2005},
  volume = {33},
  number = {2},
  pages = {139--155},
  month = feb,
  abstract = {O{BJECTIVE}: {A}utomatically extracting information from biomedical text holds the promise of easily consolidating large amounts of biological knowledge in computer-accessible form. {T}his strategy is particularly attractive for extracting data relevant to genes of the human genome from the 11 million abstracts in {M}edline. {H}owever, extraction efforts have been frustrated by the lack of conventions for describing human genes and proteins. {W}e have developed and evaluated a variety of learned information extraction systems for identifying human protein names in {M}edline abstracts and subsequently extracting information on interactions between the proteins. {METHODS} {AND} {MATERIAL}: {W}e used a variety of machine learning methods to automatically develop information extraction systems for extracting information on gene/protein name, function and interactions from {M}edline abstracts. {W}e present cross-validated results on identifying human proteins and their interactions by training and testing on a set of approximately 1000 manually-annotated {M}edline abstracts that discuss human genes/proteins. {RESULTS}: {W}e demonstrate that machine learning approaches using support vector machines and maximum entropy are able to identify human proteins with higher accuracy than several previous approaches. {W}e also demonstrate that various rule induction methods are able to identify protein interactions with higher precision than manually-developed rules. {CONCLUSION}: {O}ur results show that it is promising to use machine learning to automatically build systems for extracting information from biomedical text. {T}he results also give a broad picture of the relative strengths of a wide variety of methods when tested on a reasonably large human-annotated corpus.},
  doi = {10.1016/j.artmed.2004.07.016},
  pdf = {../local/Bunescu2005Comparative.pdf},
  file = {Bunescu2005Comparative.pdf:local/Bunescu2005Comparative.pdf:PDF},
  keywords = {biosvm},
  pii = {S0933-3657(04)00131-9},
  url = {http://dx.doi.org/10.1016/j.artmed.2004.07.016}
}
@article{Burbidge2001Drug,
  author = {Burbidge, R. and Trotter, M. and Buxton, B. and Holden, S.},
  title = {Drug design by machine learning: support vector machines for pharmaceutical data analysis},
  journal = {Comput. {C}hem.},
  year = {2001},
  volume = {26},
  number = {1},
  pages = {4--15},
  month = dec,
  pdf = {../local/burb01.pdf},
  file = {burb01.pdf:local/burb01.pdf:PDF},
  keywords = {biosvm chemoinformatics},
  subject = {qsar},
  url = {http://stats.ma.ic.ac.uk/~rdb/pubs/candc-aisb00-rbmt-final.pdf}
}
@article{Burckin2005Exploring,
  author = {Burckin, T. and Nagel, R. and Mandel-Gutfreund, Y. and Shiue, L. and Clark, T. A. and Chong, J.-L. and Chang, T.-H. and Squazzo, S. and Hartzog, G. and Ares, M.},
  title = {Exploring functional relationships between components of the gene expression machinery.},
  journal = {Nat. {S}truct. {M}ol. {B}iol.},
  year = {2005},
  volume = {12},
  number = {2},
  pages = {175--182},
  month = feb,
  abstract = {Eukaryotic gene expression requires the coordinated activity of many macromolecular machines including transcription factors and {RNA} polymerase, the spliceosome, m{RNA} export factors, the nuclear pore, the ribosome and decay machineries. {Y}east carrying mutations in genes encoding components of these machineries were examined using microarrays to measure changes in both pre-m{RNA} and m{RNA} levels. {W}e used these measurements as a quantitative phenotype to ask how steps in the gene expression pathway are functionally connected. {A} multiclass support vector machine was trained to recognize the gene expression phenotypes caused by these mutations. {I}n several cases, unexpected phenotype assignments by the computer revealed functional roles for specific factors at multiple steps in the gene expression pathway. {T}he ability to resolve gene expression pathway phenotypes provides insight into how the major machineries of gene expression communicate with each other.},
  doi = {10.1038/nsmb891},
  pdf = {../local/Burckin2005Exploring.pdf},
  file = {Burckin2005Exploring.pdf:local/Burckin2005Exploring.pdf:PDF},
  keywords = {biosvm microarray},
  pii = {nsmb891},
  url = {http://dx.doi.org/10.1038/nsmb891}
}
@article{Busuttil2004Support,
  author = {Busuttil, S. and Abela, J. and Pace, G. J.},
  title = {Support vector machines with profile-based kernels for remote protein homology detection.},
  journal = {Genome {I}nform {S}er {W}orkshop {G}enome {I}nform},
  year = {2004},
  volume = {15},
  number = {2},
  pages = {191--200},
  abstract = {Two new techniques for remote protein homology detection particulary suited for sparse data are introduced. {T}hese methods are based on position specific scoring matrices or profiles and use a support vector machine ({SVM}) for discrimination. {T}he performance on standard benchmarks outperforms previous non-discriminative techniques and is comparable to that of other {SVM}-based methods while giving distinct advantages.},
  pdf = {../local/Busuttil2004Support.pdf},
  file = {Busuttil2004Support.pdf:local/Busuttil2004Support.pdf:PDF},
  keywords = {biosvm},
  url = {http://www.jsbi.org/journal/GIW04/GIW04F020.html}
}
@article{Byvatov2003Comparison,
  author = {Byvatov, E. and Fechner, U. and Sadowski, J. and Schneider, G.},
  title = {Comparison of support vector machine and artificial neural network systems for drug/nondrug classification.},
  journal = {J {C}hem {I}nf {C}omput {S}ci},
  year = {2003},
  volume = {43},
  number = {6},
  pages = {1882--1889},
  abstract = {Support vector machine ({SVM}) and artificial neural network ({ANN}) systems were applied to a drug/nondrug classification problem as an example of binary decision problems in early-phase virtual compound filtering and screening. {T}he results indicate that solutions obtained by {SVM} training seem to be more robust with a smaller standard error compared to {ANN} training. {G}enerally, the {SVM} classifier yielded slightly higher prediction accuracy than {ANN}, irrespective of the type of descriptors used for molecule encoding, the size of the training data sets, and the algorithm employed for neural network training. {T}he performance was compared using various different descriptor sets and descriptor combinations based on the 120 standard {G}hose-{C}rippen fragment descriptors, a wide range of 180 different properties and physicochemical descriptors from the {M}olecular {O}perating {E}nvironment ({MOE}) package, and 225 topological pharmacophore ({CATS}) descriptors. {F}or the complete set of 525 descriptors cross-validated classification by {SVM} yielded 82\% correct predictions ({M}atthews cc = 0.63), whereas {ANN} reached 80\% correct predictions ({M}atthews cc = 0.58). {A}lthough {SVM} outperformed the {ANN} classifiers with regard to overall prediction accuracy, both methods were shown to complement each other, as the sets of true positives, false positives (overprediction), true negatives, and false negatives (underprediction) produced by the two classifiers were not identical. {T}he theory of {SVM} and {ANN} training is briefly reviewed.},
  doi = {10.1021/ci0341161},
  pdf = {../local/Byvatov2003Comparison.pdf},
  file = {Byvatov2003Comparison.pdf:local/Byvatov2003Comparison.pdf:PDF},
  keywords = {biosvm chemoinformatics},
  url = {http://dx.doi.org/10.1021/ci0341161}
}
@article{Byvatov2004SVM-based,
  author = {Byvatov, Evgeny and Schneider, Gisbert},
  title = {{SVM}-based feature selection for characterization of focused compound collections.},
  journal = {J {C}hem {I}nf {C}omput {S}ci},
  year = {2004},
  volume = {44},
  number = {3},
  pages = {993--999},
  abstract = {Artificial neural networks, the support vector machine ({SVM}), and other machine learning methods for the classification of molecules are often considered as a "black box", since the molecular features that are most relevant for a given classifier are usually not presented in a human-interpretable form. {W}e report on an {SVM}-based algorithm for the selection of relevant molecular features from a trained classifier that might be important for an understanding of ligand-receptor interactions. {T}he original {SVM} approach was extended to allow for feature selection. {T}he method was applied to characterize focused libraries of enzyme inhibitors. {A} comparison with classical {K}olmogorov-{S}mirnov ({KS})-based feature selection was performed. {I}n most of the applications the {SVM} method showed sustained classification accuracy, thereby relying on a smaller number of molecular features than {KS}-based classifiers. {I}n one case both methods produced comparable results. {L}imiting the calculation of descriptors to only the most relevant ones for a certain biological activity can also be used to speed up high-throughput virtual screening.},
  doi = {10.1021/ci0342876},
  pdf = {../local/Byvatov2004SVM-based.pdf},
  file = {Byvatov2004SVM-based.pdf:local/Byvatov2004SVM-based.pdf:PDF},
  keywords = {biosvm chemoinformatics featureselection},
  url = {http://dx.doi.org/10.1021/ci0342876}
}
@article{Byvatov2003Support,
  author = {Byvatov, E. and Schneider, G.},
  title = {Support vector machine applications in bioinformatics.},
  journal = {Appl {B}ioinformatics},
  year = {2003},
  volume = {2},
  number = {2},
  pages = {67--77},
  abstract = {The support vector machine ({SVM}) approach represents a data-driven method for solving classification tasks. {I}t has been shown to produce lower prediction error compared to classifiers based on other methods like artificial neural networks, especially when large numbers of features are considered for sample description. {I}n this review, the theory and main principles of the {SVM} approach are outlined, and successful applications in traditional areas of bioinformatics research are described. {C}urrent developments in techniques related to the {SVM} approach are reviewed which might become relevant for future functional genomics and chemogenomics projects. {I}n a comparative study, we developed neural network and {SVM} models to identify small organic molecules that potentially modulate the function of {G}-protein coupled receptors. {T}he {SVM} system was able to correctly classify approximately 90\% of the compounds in a cross-validation study yielding a {M}atthews correlation coefficient of 0.78. {T}his classifier can be used for fast filtering of compound libraries in virtual screening applications.},
  keywords = {biosvm}
}
@article{Cai2004Enzyme,
  author = {Cai, C. Z. and Han, L. Y. and Ji, Z. L. and Chen, Y. Z.},
  title = {Enzyme family classification by support vector machines.},
  journal = {Proteins},
  year = {2004},
  volume = {55},
  number = {1},
  pages = {66--76},
  abstract = {One approach for facilitating protein function prediction is to classify proteins into functional families. {R}ecent studies on the classification of {G}-protein coupled receptors and other proteins suggest that a statistical learning method, {S}upport vector machines ({SVM}), may be potentially useful for protein classification into functional families. {I}n this work, {SVM} is applied and tested on the classification of enzymes into functional families defined by the {E}nzyme {N}omenclature {C}ommittee of {IUBMB}. {SVM} classification system for each family is trained from representative enzymes of that family and seed proteins of {P}fam curated protein families. {T}he classification accuracy for enzymes from 46 families and for non-enzymes is in the range of 50.0\% to 95.7\% and 79.0\% to 100\% respectively. {T}he corresponding {M}atthews correlation coefficient is in the range of 54.1\% to 96.1\%. {M}oreover, 80.3\% of the 8,291 correctly classified enzymes are uniquely classified into a specific enzyme family by using a scoring function, indicating that {SVM} may have certain level of unique prediction capability. {T}esting results also suggest that {SVM} in some cases is capable of classification of distantly related enzymes and homologous enzymes of different functions. {E}ffort is being made to use a more comprehensive set of enzymes as training sets and to incorporate multi-class {SVM} classification systems to further enhance the unique prediction accuracy. {O}ur results suggest the potential of {SVM} for enzyme family classification and for facilitating protein function prediction. {O}ur software is accessible at http://jing.cz3.nus.edu.sg/cgi-bin/svmprot.cgi.},
  doi = {10.1002/prot.20045},
  pdf = {../local/Cai2004Enzyme.pdf},
  file = {Cai2004Enzyme.pdf:local/Cai2004Enzyme.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1002/prot.20045}
}
@article{Cai2003Protein,
  author = {Cai, C. Z. and Wang, W. L. and Sun, L. Z. and Chen, Y. Z.},
  title = {Protein function classification via support vector machine approach.},
  journal = {Math. {B}iosci.},
  year = {2003},
  volume = {185},
  number = {2},
  pages = {111--122},
  abstract = {Support vector machine ({SVM}) is introduced as a method for the classification of proteins into functionally distinguished classes. {S}tudies are conducted on a number of protein classes including {RNA}-binding proteins; protein homodimers, proteins responsible for drug absorption, proteins involved in drug distribution and excretion, and drug metabolizing enzymes. {T}esting accuracy for the classification of these protein classes is found to be in the range of 84-96\%. {T}his suggests the usefulness of {SVM} in the classification of protein functional classes and its potential application in protein function prediction.},
  doi = {10.1016/S0025-5564(03)00096-8},
  pdf = {../local/Cai2003Protein.pdf},
  file = {Cai2003Protein.pdf:local/Cai2003Protein.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert}
}
@article{Cai2003SVM-Prot,
  author = {Cai, C. Z. and Han, L. Y. and Ji, Z. L. and Chen, X. and Chen, Y. Z.},
  title = {{SVM-Prot}: {W}eb-based support vector machine software for functional classification of a protein from its primary sequence.},
  journal = {Nucleic {A}cids {R}es},
  year = {2003},
  volume = {31},
  number = {13},
  pages = {3692--3697},
  month = jul,
  abstract = {Prediction of protein function is of significance in studying biological processes. {O}ne approach for function prediction is to classify a protein into functional family. {S}upport vector machine ({SVM}) is a useful method for such classification, which may involve proteins with diverse sequence distribution. {W}e have developed a web-based software, {SVMP}rot, for {SVM} classification of a protein into functional family from its primary sequence. {SVMP}rot classification system is trained from representative proteins of a number of functional families and seed proteins of {P}fam curated protein families. {I}t currently covers 54 functional families and additional families will be added in the near future. {T}he computed accuracy for protein family classification is found to be in the range of 69.1-99.6\%. {SVMP}rot shows a certain degree of capability for the classification of distantly related proteins and homologous proteins of different function and thus may be used as a protein function prediction tool that complements sequence alignment methods. {SVMP}rot can be accessed at http://jing.cz3.nus.edu.sg/cgi-bin/svmprot.cgi.},
  pdf = {../local/Cai2003SVM-Prot.pdf},
  file = {Cai2003SVM-Prot.pdf:local/Cai2003SVM-Prot.pdf:PDF},
  keywords = {biosvm},
  url = {http://nar.oxfordjournals.org/cgi/content/abstract/31/13/3692}
}
@article{Cai2003Supportc,
  author = {Cai, Y. D. and Feng, K. Y. and Li, Y. X. and Chou, K. C.},
  title = {Support vector machine for predicting alpha-turn types.},
  journal = {Peptides},
  year = {2003},
  volume = {24},
  number = {4},
  pages = {629--630},
  abstract = {Tight turns play an important role in globular proteins from both the structural and functional points of view. {O}f tight turns, beta-turns and gamma-turns have been extensively studied, but alpha-turns were little investigated. {R}ecently, a systematic search for alpha-turns classified alpha-turns into nine different types according to their backbone trajectory features. {I}n this paper, {S}upport {V}ector {M}achines ({SVM}s), a new machine learning method, is proposed for predicting the alpha-turn types in proteins. {T}he high rates of correct prediction imply that that the formation of different alpha-turn types is evidently correlated with the sequence of a pentapeptide, and hence can be approximately predicted based on the sequence information of the pentapeptide alone, although the incorporation of its interaction with the other part of a protein, the so-called "long distance interaction", will further improve the prediction quality.},
  doi = {10.1016/S0196-9781(03)00100-1},
  pdf = {../local/Cai2003Supportc.pdf},
  file = {Cai2003Supportc.pdf:local/Cai2003Supportc.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1016/S0196-9781(03)00100-1}
}
@article{Cai2003Supportd,
  author = {Cai, Y. D. and Lin, S. L.},
  title = {Support vector machines for predicting r{RNA}-, {RNA}-, and {DNA}-binding proteins from amino acid sequence.},
  journal = {Biochim. {B}iophys. {A}cta},
  year = {2003},
  volume = {1648},
  number = {1-2},
  pages = {127--133},
  abstract = {Classification of gene function remains one of the most important and demanding tasks in the post-genome era. {M}ost of the current predictive computer methods rely on comparing features that are essentially linear to the protein sequence. {H}owever, features of a protein nonlinear to the sequence may also be predictive to its function. {M}achine learning methods, for instance the {S}upport {V}ector {M}achines ({SVM}s), are particularly suitable for exploiting such features. {I}n this work we introduce {SVM} and the pseudo-amino acid composition, a collection of nonlinear features extractable from protein sequence, to the field of protein function prediction. {W}e have developed prototype {SVM}s for binary classification of r{RNA}-, {RNA}-, and {DNA}-binding proteins. {U}sing a protein's amino acid composition and limited range correlation of hydrophobicity and solvent accessible surface area as input, each of the {SVM}s predicts whether the protein belongs to one of the three classes. {I}n self-consistency and cross-validation tests, which measures the success of learning and prediction, respectively, the r{RNA}-binding {SVM} has consistently achieved >95\% accuracy. {T}he {RNA}- and {DNA}-binding {SVM}s demonstrate more diverse accuracy, ranging from approximately 76\% to approximately 97\%. {A}nalysis of the test results suggests the directions of improving the {SVM}s.},
  doi = {10.1016/S1570-9639(03)00112-2},
  pdf = {../local/Cai2003Supportd.pdf},
  file = {Cai2003Supportd.pdf:local/Cai2003Supportd.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1016/S1570-9639(03)00112-2}
}
@article{Cai2003Supporta,
  author = {Cai, Y. D. and Lin, S. L. and Chou, K. C.},
  title = {Support vector machines for prediction of protein signal sequences and their cleavage sites},
  journal = {Peptides},
  year = {2003},
  volume = {24},
  number = {1},
  pages = {159--161},
  abstract = {Given a nascent protein sequence, how can one predict its signal peptide or "{Z}ipcode" sequence? {T}his is an important problem for scientists to use signal peptides as a vehicle to find new drugs or to reprogram cells for gene therapy (see, e.g. [7] {K}.{C}. {C}hou, {C}urrent {P}rotein and {P}eptide {S}cience 2002;3:615--22). {I}n this paper, support vector machines ({SVM}s), a new machine learning method, is applied to approach this problem. {T}he overall rate of correct prediction for 1939 secretary proteins and 1440 nonsecretary proteins was over 91\%. {I}t has not escaped our attention that the new method may also serve as a useful tool for further investigating many unclear details regarding the molecular mechanism of the {ZIP} code protein-sorting system in cells.},
  doi = {10.1016/S0196-9781(02)00289-9},
  pdf = {../local/Cai2003Supporta.pdf},
  file = {Cai2003Supporta.pdf:local/Cai2003Supporta.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert}
}
@article{Cai2003Prediction,
  author = {Cai, Y. D. and Liu, X. J. and Li, Y. X. and Xu, X. B. and Chou, K. C.},
  title = {Prediction of beta-turns with learning machines.},
  journal = {Peptides},
  year = {2003},
  volume = {24},
  number = {5},
  pages = {665--669},
  abstract = {The support vector machine approach was introduced to predict the beta-turns in proteins. {T}he overall self-consistency rate by the re-substitution test for the training or learning dataset reached 100\%. {B}oth the training dataset and independent testing dataset were taken from {C}hou [{J}. {P}ept. {R}es. 49 (1997) 120]. {T}he success prediction rates by the jackknife test for the beta-turn subset of 455 tetrapeptides and non-beta-turn subset of 3807 tetrapeptides in the training dataset were 58.1 and 98.4\%, respectively. {T}he success rates with the independent dataset test for the beta-turn subset of 110 tetrapeptides and non-beta-turn subset of 30,231 tetrapeptides were 69.1 and 97.3\%, respectively. {T}he results obtained from this study support the conclusion that the residue-coupled effect along a tetrapeptide is important for the formation of a beta-turn.},
  doi = {10.1016/S0196-9781(03)00133-5},
  pdf = {../local/Cai2003Prediction.pdf},
  file = {Cai2003Prediction.pdf:local/Cai2003Prediction.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1016/S0196-9781(03)00133-5}
}
@article{Cai2003Supportb,
  author = {Cai, Y. D. and Liu, X. J. and Xu, X. B. and Chou, K. C.},
  title = {Support vector machines for prediction of protein domain structural class.},
  journal = {J. {T}heor. {B}iol.},
  year = {2003},
  volume = {221},
  number = {1},
  pages = {115--120},
  abstract = {The support vector machines ({SVM}s) method was introduced for predicting the structural class of protein domains. {T}he results obtained through the self-consistency test, jack-knife test, and independent dataset test have indicated that the current method and the elegant component-coupled algorithm developed by {C}hou and co-workers, if effectively complemented with each other, may become a powerful tool for predicting the structural class of protein domains.},
  doi = {10.1006/jtbi.2003.3179},
  pdf = {../local/Cai2003Supportb.pdf},
  file = {Cai2003Supportb.pdf:local/Cai2003Supportb.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1006/jtbi.2003.3179}
}
@article{Cai2002Supporta,
  author = {Cai, Y. D. and Liu, X. J. and Xu, X. B. and Chou, K. C.},
  title = {Support {V}ector {M}achines for predicting {HIV} protease cleavage sites in protein.},
  journal = {J. {C}omput. {C}hem.},
  year = {2002},
  volume = {23},
  number = {2},
  pages = {267--274},
  abstract = {Knowledge of the polyprotein cleavage sites by {HIV} protease will refine our understanding of its specificity, and the information thus acquired is useful for designing specific and efficient {HIV} protease inhibitors. {T}he pace in searching for the proper inhibitors of {HIV} protease will be greatly expedited if one can find an accurate, robust, and rapid method for predicting the cleavage sites in proteins by {HIV} protease. {I}n this article, a {S}upport {V}ector {M}achine is applied to predict the cleavability of oligopeptides by proteases with multiple and extended specificity subsites. {W}e selected {HIV}-1 protease as the subject of the study. {T}wo hundred ninety-nine oligopeptides were chosen for the training set, while the other 63 oligopeptides were taken as a test set. {B}ecause of its high rate of self-consistency (299/299 = 100\%), a good result in the jackknife test (286/299 95\%) and correct prediction rate (55/63 = 87\%), it is expected that the {S}upport {V}ector {M}achine method can be referred to as a useful assistant technique for finding effective inhibitors of {HIV} protease, which is one of the targets in designing potential drugs against {AIDS}. {T}he principle of the {S}upport {V}ector {M}achine method can also be applied to analyzing the specificity of other multisubsite enzymes.},
  doi = {10.1002/jcc.10017},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1002/jcc.10017}
}
@article{Cai2002Supportb,
  author = {Cai, Y. D. and Liu, X. J. and Xu, X. B. and Chou, K. C.},
  title = {Support vector machines for predicting the specificity of {GalNAc}-transferase},
  journal = {Peptides},
  year = {2002},
  volume = {23},
  pages = {205--208},
  abstract = {Support {V}ector {M}achines ({SVM}s) which is one kind of learning machines, was applied to predict the specificity of {G}al{NA}c-transferase. {T}he examination for the self-consistency and the jackknife test of the {SVM}s method were tested for the training dataset (305 oligopeptides), the correct rate of self-consistency and jackknife test reaches 100\% and 84.9\%, respectively. {F}urthermore, the prediction of the independent testing dataset (30 oligopeptides) was tested, the rate reaches 76.67\%.},
  doi = {10.1016/S0196-9781(01)00597-6},
  pdf = {../local/Cai2002Supportb.pdf},
  file = {Cai2002Supportb.pdf:local/Cai2002Supportb.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1016/S0196-9781(01)00597-6}
}
@article{Cai2002Supportc,
  author = {Cai, Y. D. and Liu, X. J. and Xu, X. B. and Chou, K. C.},
  title = {Support vector machines for the classification and prediction of beta-turn types},
  journal = {J. {P}ept. {S}ci.},
  year = {2002},
  volume = {8},
  number = {7},
  pages = {297--301},
  abstract = {The support vector machines ({SVM}s) method is proposed because it can reflect the sequence-coupling effect for a tetrapeptide in not only a beta-turn or non-beta-turn, but also in different types of beta-turn. {T}he results of the model for 6022 tetrapeptides indicate that the rates of self-consistency for beta-turn types {I}, {I}', {II}, {II}', {VI} and {VIII} and non-beta-turns are 99.92\%, 96.8\%, 98.02\%, 97.75\%, 100\%, 97.19\% and 100\%, respectively. {U}sing these training data, the rate of correct prediction by the {SVM}s for a given protein: rubredoxin (54 residues. 51 tetrapeptides) which includes 12 beta-turn type {I} tetrapeptides, 1 beta-turn type {II} tetrapeptide and 38 non-beta-turns reached 82.4\%. {T}he high quality of prediction of the {SVM}s implies that the formation of different beta-turn types or non-beta-turns is considerably correlated with the sequence of a tetrapeptide. {T}he {SVM}s can save {CPU} time and avoid the overfitting problem compared with the neural network method.},
  doi = {10.1002/psc.401},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1002/psc.401}
}
@article{Cai2000Support, author = {Cai, Y.D. and Liu, X.J. and Xu, X.B. and Chou, K.C.}, title = {Support vector machines for prediction of protein subcellular location}, journal = {Mol. {C}ell {B}iol. {R}es. {C}ommun.}, year = {2000}, volume = {4}, pages = {230-234}, number = {4}, abstract = {Support {V}ector {M}achine ({SVM}), which is one kind of learning machines, was applied to predict the subcellular location of proteins from their amino acid composition. {I}n this research, the proteins are classified into the following 12 groups: (1) chloroplast, (2) cytoplasm, (3) cytoskeleton, (4) endoplasmic reticulum, (5) extracall, (6) {G}olgi apparatus, (7) lysosome, (8) mitochondria, (9) nucleus, (10) peroxisome, (11) plasma membrane, and (12) vacuole, which have covered almost all the organelles and subcellular compartments in an animal or plant cell. {T}he examination for the self-consistency and the jackknife test of the {SVM}s method was tested for the three sets: 2022 proteins, 2161 proteins, and 2319 proteins. {A}s a result, the correct rate of self-consistency and jackknife test reaches 91 and 82\% for 2022 proteins, 89 and 75\% for 2161 proteins, and 85 and 73\% for 2319 proteins, respectively. {F}urthermore, the predicting rate was tested by the three independent testing datasets containing 2240 proteins, 2513 proteins, and 2591 proteins. {T}he correct prediction rates reach 82, 75, and 73\% for 2240 proteins, 2513 proteins, and 2591 proteins, respectively.}, doi = {10.1006/mcbr.2001.0285}, keywords = {biosvm}, owner = {jeanphilippevert}, url = {http://dx.doi.org/10.1006/mcbr.2001.0285} }
@article{Cai2004Application,
  author   = {Cai, Y.D. and Ricardo, P.W. and Jen, C.H. and Chou, K.C.},
  title    = {Application of {SVM} to predict membrane protein types.},
  journal  = {J. {T}heor. {B}iol.},
  year     = {2004},
  volume   = {226},
  pages    = {373-376},
  number   = {4},
  abstract = {As a continuous effort to develop automated methods for predicting membrane protein types that was initiated by {C}hou and {E}lrod ({PROTEINS}: {S}tructure, {F}unction, and {G}enetics, 1999, 34, 137-153), the support vector machine ({SVM}) is introduced. {R}esults obtained through re-substitution, jackknife, and independent data set tests, respectively, have indicated that the {SVM} approach is quite a promising one, suggesting that the covariant discriminant algorithm ({C}hou and {E}lrod, {P}rotein {E}ng. 12 (1999) 107) and {SVM}, if effectively complemented with each other, will become a powerful tool for predicting membrane protein types and the other protein attributes as well.},
  doi      = {10.1016/j.jtbi.2003.08.015},
  pdf      = {../local/Cai2004Application.pdf},
  file     = {Cai2004Application.pdf:local/Cai2004Application.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://dx.doi.org/10.1016/j.jtbi.2003.08.015}
}
@article{Cai2004Identify, author = {Cai, Y.D. and Zhou, G.P. and Jen, C.H. and Lin, S.L. and Chou, K.C.}, title = {Identify catalytic triads of serine hydrolases by support vector machines.}, journal = {J. {T}heor. {B}iol.}, year = {2004}, volume = {228}, pages = {551-557}, number = {4}, abstract = {The core of an enzyme molecule is its active site from the viewpoints of both academic research and industrial application. {T}o reveal the structural and functional mechanism of an enzyme, one needs to know its active site; to conduct structure-based drug design by regulating the function of an enzyme, one needs to know the active site and its microenvironment as well. {G}iven the atomic coordinates of an enzyme molecule, how can we predict its active site? {T}o tackle such a problem, a distance group approach was proposed and the support vector machine algorithm applied to predict the catalytic triad of serine hydrolase family. {T}he success rate by jackknife test for the 139 serine hydrolases was 85\%, implying that the method is quite promising and may become a useful tool in structural bioinformatics.}, doi = {10.1016/j.jtbi.2004.02.019}, pdf = {../local/Cai2004Identify.pdf}, file = {Cai2004Identify.pdf:local/Cai2004Identify.pdf:PDF}, keywords = {biosvm}, owner = {jeanphilippevert}, url = {http://dx.doi.org/10.1016/j.jtbi.2004.02.019} }
@article{Cai2004Prediction,
  author   = {Yu-Dong Cai and Andrew J Doig},
  title    = {Prediction of {S}accharomyces cerevisiae protein functional class from functional domain composition.},
  journal  = {Bioinformatics},
  year     = {2004},
  volume   = {20},
  pages    = {1292-300},
  number   = {8},
  month    = {May},
  abstract = {M{OTIVATION}: {A} key goal of genomics is to assign function to genes, especially for orphan sequences. {RESULTS}: {W}e compared the clustered functional domains in the {SBASE} database to each protein sequence using {BLASTP}. {T}his representation for a protein is a vector, where each of the non-zero entries in the vector indicates a significant match between the sequence of interest and the {SBASE} domain. {T}he machine learning methods nearest neighbour algorithm ({NNA}) and support vector machines are used for predicting protein functional classes from this information. {W}e find that the best results are found using the {SBASE}-{A} database and the {NNA}, namely 72\% accuracy for 79\% coverage. {W}e tested an assigning function based on searching for {I}nter{P}ro sequence motifs and by taking the most significant {BLAST} match within the dataset. {W}e applied the functional domain composition method to predict the functional class of 2018 currently unclassified yeast open reading frames. {AVAILABILITY}: {A} program for the prediction method, that uses {NNA} called {F}unctional {C}lass {P}rediction based on {F}unctional {D}omains ({FCPFD}) is available and can be obtained by contacting {Y}.{D}.{C}ai at y.cai@umist.ac.uk},
  doi      = {10.1093/bioinformatics/bth085},
  pdf      = {../local/Cai2004Prediction.pdf},
  file     = {Cai2004Prediction.pdf:local/Cai2004Prediction.pdf:PDF},
  keywords = {biosvm},
  pii      = {bth085},
  url      = {http://dx.doi.org/10.1093/bioinformatics/bth085}
}
@article{Cai2002Support, author = {Cai, Y.-D. and Liu, X.-J. and Xu, X.-B. and Chou, K.-C.}, title = {Support vector machines for prediction of protein subcellular location by incorporating quasi-sequence-order effect}, journal = {J. {C}ell. {B}iochem.}, year = {2002}, volume = {84}, pages = {343-348}, number = {2}, abstract = {Support {V}ector {M}achine ({SVM}), which is one class of learning machines, was applied to predict the subcellular location of proteins by incorporating the quasi-sequence-order effect ({C}hou [2000] {B}iochem. {B}iophys. {R}es. {C}ommun. 278:477-483). {I}n this study, the proteins are classified into the following 12 groups: (1) chloroplast, (2) cytoplasm, (3) cytoskeleton, (4) endoplasmic reticulum, (5) extracellular, (6) {G}olgi apparatus, (7) lysosome, (8) mitochondria, (9) nucleus, (10) peroxisome, (11) plasma membrane, and (12) vacuole, which account for most organelles and subcellular compartments in an animal or plant cell. {E}xaminations for self-consistency and jackknife testing of the {SVM}s method were conducted for three sets consisting of 1,911, 2,044, and 2,191 proteins. {T}he correct rates for self-consistency and the jackknife test values achieved with these protein sets were 94 and 83\% for 1,911 proteins, 92 and 78\% for 2,044 proteins, and 89 and 75\% for 2,191 proteins, respectively. {F}urthermore, tests for correct prediction rates were undertaken with three independent testing datasets containing 2,148 proteins, 2,417 proteins, and 2,494 proteins producing values of 84, 77, and 74\%, respectively.}, doi = {10.1002/jcb.10030}, pdf = {../local/Cai2002Support.pdf}, file = {Cai2002Support.pdf:local/Cai2002Support.pdf:PDF}, keywords = {biosvm}, owner = {jeanphilippevert}, url = {http://dx.doi.org/10.1002/jcb.10030} }
@article{Cai2002Prediction,
  author   = {Cai, Y.-D. and Liu, X.-J. and Xu, X.-B. and Zhou, G.-P.},
  title    = {Prediction of protein structural classes by support vector machines.},
  journal  = {Comput. {C}hem.},
  year     = {2002},
  volume   = {26},
  pages    = {293-296},
  number   = {3},
  abstract = {In this paper, we apply a new machine learning method which is called support vector machine to approach the prediction of protein structural class. {T}he support vector machine method is performed based on the database derived from {SCOP} which is based upon domains of known structure and the evolutionary relationships and the principles that govern their 3{D} structure. {A}s a result, high rates of both self-consistency and jackknife test are obtained. {T}his indicates that the structural class of a protein inconsiderably correlated with its amino and composition, and the support vector machine can be referred as a powerful computational tool for predicting the structural classes of proteins.},
  doi      = {10.1016/S0097-8485(01)00113-9},
  pdf      = {../local/Cai2002Prediction.pdf},
  file     = {Cai2002Prediction.pdf:local/Cai2002Prediction.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://dx.doi.org/10.1016/S0097-8485(01)00113-9}
}
@article{Cai2001Support,
  author   = {Cai, Y.-D. and Liu, X.-J. and Xu, X.-B. and Zhou, G.-P.},
  title    = {Support {V}ector {M}achines for predicting protein structural class},
  journal  = {B{MC} {B}ioinformatics},
  year     = {2001},
  volume   = {2},
  pages    = {3},
  number   = {3},
  abstract = {Background {W}e apply a new machine learning method, the so-called {S}upport {V}ector {M}achine method, to predict the protein structural class. {S}upport {V}ector {M}achine method is performed based on the database derived from {SCOP}, in which protein domains are classified based on known structures and the evolutionary relationships and the principles that govern their 3-{D} structure. {R}esults {H}igh rates of both self-consistency and jackknife tests are obtained. {T}he good results indicate that the structural class of a protein is considerably correlated with its amino acid composition. {C}onclusions {I}t is expected that the {S}upport {V}ector {M}achine method and the elegant component-coupled method, also named as the covariant discrimination algorithm, if complemented with each other, can provide a powerful computational tool for predicting the structural classes of proteins.},
  doi      = {10.1186/1471-2105-2-3},
  pdf      = {../local/Cai2001Support.pdf},
  file     = {Cai2001Support.pdf:local/Cai2001Support.pdf:PDF},
  keywords = {biosvm},
  owner    = {vert},
  url      = {http://www.biomedcentral.com/1471-2105/2/3/abstract}
}
@article{Cai2003Support,
  author   = {Cai, Y.-D. and Zhou, G.-P. and Chou, K.-C.},
  title    = {Support {V}ector {M}achines for {P}redicting {M}embrane {P}rotein {T}ypes by {U}sing {F}unctional {D}omain {C}omposition},
  journal  = {Biophys. {J}.},
  year     = {2003},
  volume   = {84},
  pages    = {3257-3263},
  number   = {5},
  abstract = {Membrane proteins are generally classified into the following five types: 1), type {I} membrane protein; 2), type {II} membrane protein; 3), multipass transmembrane proteins; 4), lipid chain-anchored membrane proteins; and 5), {GPI}-anchored membrane proteins. {I}n this article, based on the concept of using the functional domain composition to define a protein, the {S}upport {V}ector {M}achine algorithm is developed for predicting the membrane protein type. {H}igh success rates are obtained by both the self-consistency and jackknife tests. {T}he current approach, complemented with the powerful covariant discriminant algorithm based on the pseudo-amino acid composition that has incorporated quasi-sequence-order effect as recently proposed by {K}. {C}. {C}hou (2001), may become a very useful high-throughput tool in the area of bioinformatics and proteomics.},
  pdf      = {../local/Cai2003Support.pdf},
  file     = {Cai2003Support.pdf:local/Cai2003Support.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://www.biophysj.org/cgi/content/abstract/84/5/3257}
}
@article{Camps-Valls2004Profiled, author = {Camps-Valls, G. and Chalk, A.M. and Serrano-Lopez, A.J. and Martin-Guerrero, J.D. and Sonnhammer, E.L.}, title = {Profiled support vector machines for antisense oligonucleotide efficacy prediction.}, journal = {B{MC} {B}ioinformatics}, year = {2004}, volume = {5}, pages = {135}, number = {135}, abstract = {Background {T}his paper presents the use of {S}upport {V}ector {M}achines ({SVM}s) for prediction and analysis of antisense oligonucleotide ({AO}) efficacy. {T}he collected database comprises 315 {AO} molecules including 68 features each, inducing a problem well-suited to {SVM}s. {T}he task of feature selection is crucial given the presence of noisy or redundant features, and the well-known problem of the curse of dimensionality. {W}e propose a two-stage strategy to develop an optimal model: (1) feature selection using correlation analysis, mutual information, and {SVM}-based recursive feature elimination ({SVM}-{RFE}), and (2) {AO} prediction using standard and profiled {SVM} formulations. {A} profiled {SVM} gives different weights to different parts of the training data to focus the training on the most important regions. {R}esults {I}n the first stage, the {SVM}-{RFE} technique was most efficient and robust in the presence of low number of samples and high input space dimension. {T}his method yielded an optimal subset of 14 representative features, which were all related to energy and sequence motifs. {T}he second stage evaluated the performance of the predictors (overall correlation coefficient between observed and predicted efficacy, r; mean error, {ME}; and root-mean-square-error, {RMSE}) using 8-fold and minus-one-{RNA} cross-validation methods.
{T}he profiled {SVM} produced the best results (r = 0.44, {ME} = 0.022, and {RMSE}= 0.278) and predicted high (>75\% inhibition of gene expression) and low efficacy (<25\%) {AO}s with a success rate of 83.3\% and 82.9\%, respectively, which is better than by previous approaches. {A} web server for {AO} prediction is available online at http://aosvm.cgb.ki.se/. {C}onclusions {T}he {SVM} approach is well suited to the {AO} prediction problem, and yields a prediction accuracy superior to previous methods. {T}he profiled {SVM} was found to perform better than the standard {SVM}, suggesting that it could lead to improvements in other prediction problems as well.}, doi = {10.1186/1471-2105-5-135}, pdf = {../local/Camps-Valls2004Profiled.pdf}, file = {Camps-Valls2004Profiled.pdf:local/Camps-Valls2004Profiled.pdf:PDF}, keywords = {biosvm}, owner = {jeanphilippevert}, url = {http://www.biomedcentral.com/1471-2105/5/135} }
@article{Capriotti2005I-Mutant, author = {Capriotti, E. and Fariselli, P. and Casadio, R.}, title = {I-{M}utant2.0: predicting stability changes upon mutation from the protein sequence or structure.}, journal = {Nucleic {A}cids {R}es.}, year = {2005}, volume = {33}, pages = {W306-10}, number = {Web Server issue}, month = {Jul}, abstract = {I-{M}utant2.0 is a support vector machine ({SVM})-based tool for the automatic prediction of protein stability changes upon single point mutations. {I}-{M}utant2.0 predictions are performed starting either from the protein structure or, more importantly, from the protein sequence. {T}his latter task, to the best of our knowledge, is exploited for the first time. {T}he method was trained and tested on a data set derived from {P}ro{T}herm, which is presently the most comprehensive available database of thermodynamic experimental data of free energy changes of protein stability upon mutation under different conditions. {I}-{M}utant2.0 can be used both as a classifier for predicting the sign of the protein stability change upon mutation and as a regression estimator for predicting the related {D}elta{D}elta{G} values. {A}cting as a classifier, {I}-{M}utant2.0 correctly predicts (with a cross-validation procedure) 80\% or 77\% of the data set, depending on the usage of structural or sequence information, respectively. {W}hen predicting {D}elta{D}elta{G} values associated with mutations, the correlation of predicted with expected/experimental values is 0.71 (with a standard error of 1.30 kcal/mol) and 0.62 (with a standard error of 1.45 kcal/mol) when structural or sequence information are respectively adopted. {O}ur web interface allows the selection of a predictive mode that depends on the availability of the protein structure and/or sequence. {I}n this latter case, the web server requires only pasting of a protein sequence in a raw format.
{W}e therefore introduce {I}-{M}utant2.0 as a unique and valuable helper for protein design, even when the protein structure is not yet known with atomic resolution. {A}vailability: http://gpcr.biocomp.unibo.it/cgi/predictors/{I}-{M}utant2.0/{I}-{M}utant2.0.cgi.}, doi = {10.1093/nar/gki375}, keywords = {biosvm}, pii = {33/suppl_2/W306}, url = {http://dx.doi.org/10.1093/nar/gki375} }
@article{Carter2001computational, author = {Carter, R. J. and Dubchak, I. and Holbrook, S. R.}, title = {A computational approach to identify genes for functional {{RNA}s} in genomic sequences}, journal = {Nucl. {A}cids {R}es.}, year = {2001}, volume = {29}, pages = {3928-3938}, number = {19}, abstract = {Currently there is no successful computational approach for identification of genes encoding novel functional {RNA}s (f{RNA}s) in genomic sequences. {W}e have developed a machine learning approach using neural networks and support vector machines to extract common features among known {RNA}s for prediction of new {RNA} genes in the unannotated regions of prokaryotic and archaeal genomes. {T}he {E}scherichia coli genome was used for development, but we have applied this method to several other bacterial and archaeal genomes. {N}etworks based on nucleotide composition were 80-90\% accurate in jackknife testing experiments for bacteria and 90-99\% for hyperthermophilic archaea. {W}e also achieved a significant improvement in accuracy by combining these predictions with those obtained using a second set of parameters consisting of known {RNA} sequence motifs and the calculated free energy of folding. {S}everal known f{RNA}s not included in the training datasets were identified as well as several hundred predicted novel {RNA}s. {T}hese studies indicate that there are many unidentified {RNA}s in simple genomes that can be predicted computationally as a precursor to experimental study. {P}ublic access to our {RNA} gene predictions and an interface for user predictions is available via the web.}, pdf = {../local/Carter2001computational.pdf}, file = {Carter2001computational.pdf:local/Carter2001computational.pdf:PDF}, keywords = {biosvm}, owner = {jeanphilippevert}, url = {http://nar.oupjournals.org/cgi/content/abstract/29/19/3928} }
@article{Chen2004Prediction, author = {Chen, Y.C. and Lin, Y.S. and Lin, C.J. and Hwang, J.K.}, title = {Prediction of the bonding states of cysteines using the support vector machines based on multiple feature vectors and cysteine state sequences}, journal = {Proteins}, year = {2004}, volume = {55}, pages = {1036-1042}, number = {4}, abstract = {The support vector machine ({SVM}) method is used to predict the bonding states of cysteines. {B}esides using local descriptors such as the local sequences, we include global information, such as amino acid compositions and the patterns of the states of cysteines (bonded or nonbonded), or cysteine state sequences, of the proteins. {W}e found that {SVM} based on local sequences or global amino acid compositions yielded similar prediction accuracies for the data set comprising 4136 cysteine-containing segments extracted from 969 nonhomologous proteins. {H}owever, the {SVM} method based on multiple feature vectors (combining local sequences and global amino acid compositions) significantly improves the prediction accuracy, from 80\% to 86\%. {I}f coupled with cysteine state sequences, {SVM} based on multiple feature vectors yields 90\% in overall prediction accuracy and a 0.77 {M}atthews correlation coefficient, around 10\% and 22\% higher than the corresponding values obtained by {SVM} based on local sequence information.}, doi = {10.1002/prot.20079}, pdf = {../local/Chen2004Prediction.pdf}, file = {Chen2004Prediction.pdf:local/Chen2004Prediction.pdf:PDF}, keywords = {biosvm}, owner = {jeanphilippevert}, url = {http://dx.doi.org/10.1002/prot.20079} }
@article{Chen2005Understanding, author = {Chen, Y. and Xu, D.}, title = {Understanding protein dispensability through machine-learning analysis of high-throughput data}, journal = {Bioinformatics}, year = {2005}, volume = {21}, pages = {575-581}, month = {Mar}, abstract = {Motivation: {P}rotein dispensability is fundamental to understanding of gene function and evolution. {R}ecent advances in generating high-throughput data such as genomic sequence data, protein-protein interaction data, gene-expression data, and growth-rate data of mutants allow us to investigate protein dispensability systematically at the genome scale. {R}esults: {I}n our studies, protein dispensability is represented as a fitness score that is measured by the growth rate of gene-deletion mutants. {T}hrough analyses of high-throughput data in yeast {S}accharomyces cerevisiae, we found that a protein's dispensability had significant correlations with its evolutionary rate and duplication rate, as well as its connectivity in protein-protein interaction network and gene-expression correlation network. {N}eural network and support vector machine were applied to predict protein dispensability through high-throughput data. {O}ur studies shed some lights on global characteristics of protein dispensability and evolution. {A}vailability: {T}he original datasets for protein dispensability analysis and prediction, together with related scripts, are available at http://digbio.missouri.edu/~ychen/{P}ro{D}ispen/.}, doi = {10.1093/bioinformatics/bti058}, pdf = {../local/Chen2005Understanding.pdf}, file = {Chen2005Understanding.pdf:local/Chen2005Understanding.pdf:PDF}, keywords = {biosvm}, owner = {jeanphilippevert}, url = {http://dx.doi.org/10.1093/bioinformatics/bti058} }
@article{Chou2002Using,
  author   = {Chou, K.-C. and Cai, Y.-D.},
  title    = {Using {F}unctional {D}omain {C}omposition and {S}upport {V}ector {M}achines for {P}rediction of {P}rotein {S}ubcellular {L}ocation},
  journal  = {J. {B}iol. {C}hem.},
  year     = {2002},
  volume   = {277},
  pages    = {45765-45769},
  number   = {48},
  abstract = {Proteins are generally classified into the following 12 subcellular locations: 1) chloroplast, 2) cytoplasm, 3) cytoskeleton, 4) endoplasmic reticulum, 5) extracellular, 6) {G}olgi apparatus, 7) lysosome, 8) mitochondria, 9) nucleus, 10) peroxisome, 11) plasma membrane, and 12) vacuole. {B}ecause the function of a protein is closely correlated with its subcellular location, with the rapid increase in new protein sequences entering into databanks, it is vitally important for both basic research and pharmaceutical industry to establish a high throughput tool for predicting protein subcellular location. {I}n this paper, a new concept, the so-called "functional domain composition" is introduced. {B}ased on the novel concept, the representation for a protein can be defined as a vector in a high-dimensional space, where each of the clustered functional domains derived from the protein universe serves as a vector base. {W}ith such a novel representation for a protein, the support vector machine ({SVM}) algorithm is introduced for predicting protein subcellular location. {H}igh success rates are obtained by the self-consistency test, jackknife test, and independent dataset test, respectively. {T}he current approach not only can play an important complementary role to the powerful covariant discriminant algorithm based on the pseudo amino acid composition representation ({C}hou, {K}. {C}. (2001) {P}roteins {S}truct. {F}unct. {G}enet. 43, 246-255; {C}orrection (2001) {P}roteins {S}truct. {F}unct. {G}enet. 44, 60), but also may greatly stimulate the development of this area.},
  pdf      = {../local/Chou2002Using.pdf},
  file     = {Chou2002Using.pdf:local/Chou2002Using.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://www.jbc.org/cgi/content/abstract/277/48/45765}
}
@article{Chow2001Identifying,
  author   = {M. L. Chow and E. J. Moler and I. S. Mian},
  title    = {Identifying marker genes in transcription profiling data using a mixture of feature relevance experts.},
  journal  = {Physiol. {G}enomics},
  year     = {2001},
  volume   = {5},
  pages    = {99-111},
  number   = {2},
  month    = {Mar},
  abstract = {Transcription profiling experiments permit the expression levels of many genes to be measured simultaneously. {G}iven profiling data from two types of samples, genes that most distinguish the samples (marker genes) are good candidates for subsequent in-depth experimental studies and developing decision support systems for diagnosis, prognosis, and monitoring. {T}his work proposes a mixture of feature relevance experts as a method for identifying marker genes and illustrates the idea using published data from samples labeled as acute lymphoblastic and myeloid leukemia ({ALL}, {AML}). {A} feature relevance expert implements an algorithm that calculates how well a gene distinguishes samples, reorders genes according to this relevance measure, and uses a supervised learning method [here, support vector machines ({SVM}s)] to determine the generalization performances of different nested gene subsets. {T}he mixture of three feature relevance experts examined implement two existing and one novel feature relevance measures. {F}or each expert, a gene subset consisting of the top 50 genes distinguished {ALL} from {AML} samples as completely as all 7,070 genes. {T}he 125 genes at the union of the top 50s are plausible markers for a prototype decision support system. {C}hromosomal aberration and other data support the prediction that the three genes at the intersection of the top 50s, cystatin {C}, azurocidin, and adipsin, are good targets for investigating the basic biology of {ALL}/{AML}. {T}he same data were employed to identify markers that distinguish samples based on their labels of {T} cell/{B} cell, peripheral blood/bone marrow, and male/female. {S}elenoprotein {W} may discriminate {T} cells from {B} cells. {R}esults from analysis of transcription profiling data from tumor/nontumor colon adenocarcinoma samples support the general utility of the aforementioned approach. {T}heoretical issues such as choosing {SVM} kernels and their parameters, training and evaluating feature relevance experts, and the impact of potentially mislabeled samples on marker identification (feature selection) are discussed.},
  pdf      = {../local/Chow2001Identifying.pdf},
  file     = {Chow2001Identifying.pdf:local/Chow2001Identifying.pdf:PDF},
  keywords = {biosvm},
  pii      = {5/2/99},
  url      = {http://physiolgenomics.physiology.org/cgi/content/abstract/5/2/99}
}
@article{Collier2004Comparison,
  author   = {Nigel Collier and Koichi Takeuchi},
  title    = {Comparison of character-level and part of speech features for name recognition in biomedical texts.},
  journal  = {J {B}iomed {I}nform},
  year     = {2004},
  volume   = {37},
  pages    = {423-35},
  number   = {6},
  month    = {Dec},
  abstract = {The immense volume of data which is now available from experiments in molecular biology has led to an explosion in reported results most of which are available only in unstructured text format. {F}or this reason there has been great interest in the task of text mining to aid in fact extraction, document screening, citation analysis, and linkage with large gene and gene-product databases. {I}n particular there has been an intensive investigation into the named entity ({NE}) task as a core technology in all of these tasks which has been driven by the availability of high volume training sets such as the {GENIA} v3.02 corpus. {D}espite such large training sets accuracy for biology {NE} has proven to be consistently far below the high levels of performance in the news domain where {F} scores above 90 are commonly reported which can be considered near to human performance. {W}e argue that it is crucial that more rigorous analysis of the factors that contribute to the model's performance be applied to discover where the underlying limitations are and what our future research direction should be. {O}ur investigation in this paper reports on variations of two widely used feature types, part of speech ({POS}) tags and character-level orthographic features, and makes a comparison of how these variations influence performance. {W}e base our experiments on a proven state-of-the-art model, support vector machines using a high quality subset of 100 annotated {MEDLINE} abstracts. {E}xperiments reveal that the best performing features are orthographic features with {F} score of 72.6. {A}lthough the {B}rill tagger trained in-domain on the {GENIA} v3.02p {POS} corpus gives the best overall performance of any {POS} tagger, at an {F} score of 68.6, this is still significantly below the orthographic features. {I}n combination these two features types appear to interfere with each other and degrade performance slightly to an {F} score of 72.3.},
  doi      = {10.1016/j.jbi.2004.08.008},
  pdf      = {../local/Collier2004Comparison.pdf},
  file     = {Collier2004Comparison.pdf:local/Collier2004Comparison.pdf:PDF},
  keywords = {biosvm nlp},
  pii      = {S1532-0464(04)00088-7},
  url      = {http://dx.doi.org/10.1016/j.jbi.2004.08.008}
}
@article{Cui2004Esub8, author = {Cui, Q. and Jiang, T. and Liu, B. and Ma, S.}, title = {Esub8: {A} novel tool to predict protein subcellular localizations in eukaryotic organisms}, journal = {B{MC} {B}ioinformatics}, year = {2004}, volume = {5}, pages = {66}, number = {66}, abstract = {Background {S}ubcellular localization of a new protein sequence is very important and fruitful for understanding its function. {A}s the number of new genomes has dramatically increased over recent years, a reliable and efficient system to predict protein subcellular location is urgently needed. {R}esults {E}sub8 was developed to predict protein subcellular localizations for eukaryotic proteins based on amino acid composition. {I}n this research, the proteins are classified into the following eight groups: chloroplast, cytoplasm, extracellular, {G}olgi apparatus, lysosome, mitochondria, nucleus and peroxisome. {W}e know subcellular localization is a typical classification problem; consequently, a one-against-one (1-v-1) multi-class support vector machine was introduced to construct the classifier. {U}nlike previous methods, ours considers the order information of protein sequences by a different method. {O}ur method is tested in three subcellular localization predictions for prokaryotic proteins and four subcellular localization predictions for eukaryotic proteins on {R}einhardt's dataset. {T}he results are then compared to several other methods. {T}he total prediction accuracies of two tests are both 100\% by a self-consistency test, and are 92.9\% and 84.14\% by the jackknife test, respectively. {E}sub8 also provides excellent results: the total prediction accuracies are 100\% by a self-consistency test and 87\% by the jackknife test.
{C}onclusions {O}ur method represents a different approach for predicting protein subcellular localization and achieved a satisfactory result; furthermore, we believe {E}sub8 will be a useful tool for predicting protein subcellular localizations in eukaryotic organisms.}, doi = {10.1186/1471-2105-5-66}, pdf = {../local/Cui2004Esub8.pdf}, file = {Cui2004Esub8.pdf:local/Cui2004Esub8.pdf:PDF}, keywords = {biosvm}, owner = {vert}, url = {http://www.biomedcentral.com/1471-2105/5/66} }
@article{Cuturi2005context-tree, author = {Cuturi, M. and Vert, J.-P.}, title = {The context-tree kernel for strings}, journal = {Neural {N}etwork.}, year = {2005}, volume = {18}, pages = {1111-1123}, number = {4}, abstract = {We propose a new kernel for strings which borrows ideas and techniques from information theory and data compression. {T}his kernel can be used in combination with any kernel method, in particular {S}upport {V}ector {M}achines for string classification, with notable applications in proteomics. {B}y using a {B}ayesian averaging framework with conjugate priors on a class of {M}arkovian models known as probabilistic suffix trees or context-trees, we compute the value of this kernel in linear time and space while only using the information contained in the spectrum of the considered strings. {T}his is ensured through an adaptation of a compression method known as the context-tree weighting algorithm. {E}ncouraging classification results are reported on a standard protein homology detection experiment, showing that the context-tree kernel performs well with respect to other state-of-the-art methods while using no biological prior knowledge.}, doi = {10.1016/j.neunet.2005.07.010}, pdf = {../local/Cuturi2005context-tree.pdf}, file = {Cuturi2005context-tree.pdf:local/Cuturi2005context-tree.pdf:PDF}, keywords = {biosvm}, url = {http://dx.doi.org/10.1016/j.neunet.2005.07.010} }
@inproceedings{Cuturi2004mutual,
  author    = {Cuturi, M. and Vert, J.-P.},
  title     = {A mutual information kernel for strings},
  booktitle = {Proceedings of {IJCNN} 2004},
  year      = {2004},
  pages     = {1904--1910},
  pdf       = {../local/ijcnn04.pdf:http\://cg.ensmp.fr/~vert/publi/04ijcnn/ijcnn04.pdf:PDF},
  file      = {ijcnn04.pdf:http\://cg.ensmp.fr/~vert/publi/04ijcnn/ijcnn04.pdf:PDF},
  keywords  = {biosvm},
  owner     = {vert}
}
@article{Degroeve2002Feature,
  author   = {Degroeve, S. and De Baets, B. and Van de Peer, Y. and Rouze, P.},
  title    = {Feature subset selection for splice site prediction},
  journal  = {Bioinformatics},
  year     = {2002},
  volume   = {18},
  pages    = {S75--S83},
  number   = {Suppl. 2},
  abstract = {Motivation: {T}he large amount of available annotated {A}rabidopsis thaliana sequences allows the induction of splice site prediction models with supervised learning algorithms (see {H}aussler (1998) for a review and references). {T}hese algorithms need information sources or features from which the models can be computed. {F}or splice site prediction, the features we consider in this study are the presence or absence of certain nucleotides in close proximity to the splice site. {S}ince it is not known how many and which nucleotides are relevant for splice site prediction, the set of features is chosen large enough such that the probability that all relevant information sources are in the set is very high. {U}sing only those features that are relevant for constructing a splice site prediction system might improve the system and might also provide us with useful biological knowledge. {U}sing fewer features will of course also improve the prediction speed of the system. {R}esults: {A} wrapper-based feature subset selection algorithm using a support vector machine or a naive {B}ayes prediction method was evaluated against the traditional method for selecting features relevant for splice site prediction. {O}ur results show that this wrapper approach selects features that improve the performance against the use of all features and against the use of the features selected by the traditional method. {A}vailability: {T}he data and additional interactive graphs on the selected feature subsets are available at http://www.psb.rug.ac.be/gps {C}ontact: svgro@gengenp.rug.ac.be yvdp@gengenp.rug.ac.be},
  pdf      = {../local/Degroeve2002Feature.pdf},
  file     = {Degroeve2002Feature.pdf:local/Degroeve2002Feature.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/18/suppl_2/S75}
}
@article{Degroeve2005SpliceMachine,
  author   = {Degroeve, S. and Saeys, Y. and De Baets, B. and Rouze, P. and Van de Peer, Y.},
  title    = {{{S}plice{M}achine}: predicting splice sites from high-dimensional local context representations},
  journal  = {Bioinformatics},
  year     = {2005},
  volume   = {21},
  pages    = {1332--1338},
  abstract = {Motivation: {I}n this age of complete genome sequencing, finding the location and structure of genes is crucial for further molecular research. {T}he accurate prediction of intron boundaries largely facilitates the correct prediction of gene structure in nuclear genomes. {M}any tools for localizing these boundaries on {DNA} sequences have been developed and are available to researchers through the internet. {N}evertheless, these tools still make many false positive predictions. {R}esults: {T}his manuscript presents a novel publicly available splice site prediction tool named {S}plice{M}achine that (i) shows state-of-the-art prediction performance on {A}rabidopsis thaliana and human sequences, (ii) performs a computationally fast annotation, and (iii) can be trained by the user on its own data. {A}vailability: {R}esults, figures and software are available at http://bioinformatics.psb.ugent.be/supplementary\_data/.},
  doi      = {10.1093/bioinformatics/bti166},
  pdf      = {../local/Degroeve2005SpliceMachine.pdf},
  file     = {Degroeve2005SpliceMachine.pdf:local/Degroeve2005SpliceMachine.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/bti166v1}
}
@inproceedings{Deshpande2002Evaluation,
  author    = {Deshpande, M. and Karypis, G.},
  title     = {Evaluation of {T}echniques for {C}lassifying {B}iological {S}equences},
  booktitle = {P{AKDD} '02: {P}roceedings of the 6th {P}acific-{A}sia {C}onference on {A}dvances in {K}nowledge {D}iscovery and {D}ata {M}ining},
  publisher = {Springer Verlag},
  year      = {2002},
  pages     = {417--431},
  abstract  = {In recent years we have witnessed an exponential increase in the amount of biological information, either {DNA} or protein sequences, that has become available in public databases. {T}his has been followed by an increased interest in developing computational techniques to automatically classify these large volumes of sequence data into various categories corresponding to either their role in the chromosomes, their structure, and/or their function. {I}n this paper we evaluate some of the widely-used sequence classification algorithms and develop a framework for modeling sequences in a fashion so that traditional machine learning algorithms, such as support vector machines, can be applied easily. {O}ur detailed experimental evaluation shows that the {SVM}-based approaches are able to achieve higher classification accuracy compared to the more traditional sequence classification algorithms such as {M}arkov model based techniques and {K}-nearest neighbor based approaches.},
  keywords  = {biosvm},
  owner     = {jeanphilippevert},
  pdf       = {../local/Deshpande2002Evaluation.pdf},
  file      = {Deshpande2002Evaluation.pdf:local/Deshpande2002Evaluation.pdf:PDF}
}
@article{Ding2001Multi-class,
  author   = {Ding, C.H.Q. and Dubchak, I.},
  title    = {Multi-class protein fold recognition using support vector machines and neural networks},
  journal  = {Bioinformatics},
  year     = {2001},
  volume   = {17},
  pages    = {349--358},
  number   = {4},
  abstract = {Motivation: {P}rotein fold recognition is an important approach to structure discovery without relying on sequence similarity. {W}e study this approach with new multi-class classification methods and examined many issues important for a practical recognition system. {R}esults: {M}ost current discriminative methods for protein fold prediction use the one-against-others method, which has the well-known ``{F}alse {P}ositives'' problem. {W}e investigated two new methods: the unique one-against-others and the all-against-all methods. {B}oth improve prediction accuracy by 14--110\% on a dataset containing 27 {SCOP} folds. {W}e used the {S}upport {V}ector {M}achine ({SVM}) and the {N}eural {N}etwork ({NN}) learning methods as base classifiers. {SVM}s converges fast and leads to high accuracy. {W}hen scores of multiple parameter datasets are combined, majority voting reduces noise and increases recognition accuracy. {W}e examined many issues involved with large number of classes, including dependencies of prediction accuracy on the number of folds and on the number of representatives in a fold. {O}verall, recognition systems achieve 56\% fold prediction accuracy on a protein test dataset, where most of the proteins have below 25\% sequence identity with the proteins used in training. {S}upplementary information: {T}he protein parameter datasets used in this paper are available online (http://www.nersc.gov/~cding/protein).},
  pdf      = {../local/Ding2001Multi-class.pdf},
  file     = {Ding2001Multi-class.pdf:local/Ding2001Multi-class.pdf:PDF},
  keywords = {biosvm},
  subject  = {biokernel},
  url      = {http://bioinformatics.oupjournals.org/cgi/reprint/17/4/349.pdf}
}
@article{Dobson2005Predicting,
  author   = {Dobson, P.D. and Doig, A.J.},
  title    = {Predicting enzyme class from protein structure without alignments},
  journal  = {J. {M}ol. {B}iol.},
  year     = {2005},
  volume   = {345},
  pages    = {187--199},
  number   = {1},
  month    = jan,
  abstract = {Methods for predicting protein function from structure are becoming more important as the rate at which structures are solved increases more rapidly than experimental knowledge. {A}s a result, protein structures now frequently lack functional annotations. {T}he majority of methods for predicting protein function are reliant upon identifying a similar protein and transferring its annotations to the query protein. {T}his method fails when a similar protein cannot be identified, or when any similar proteins identified also lack reliable annotations. {H}ere, we describe a method that can assign function from structure without the use of algorithms reliant upon alignments. {U}sing simple attributes that can be calculated from any crystal structure, such as secondary structure content, amino acid propensities, surface properties and ligands, we describe each enzyme in a non-redundant set. {T}he set is split according to {E}nzyme {C}lassification ({EC}) number. {W}e combine the predictions of one-class versus one-class support vector machine models to make overall assignments of {EC} number to an accuracy of 35\% with the top-ranked prediction, rising to 60\% accuracy with the top two ranks. {I}n doing so we demonstrate the utility of simple structural attributes in protein function prediction and shed light on the link between structure and function. {W}e apply our methods to predict the function of every currently unclassified protein in the {P}rotein {D}ata {B}ank.},
  doi      = {10.1016/j.jmb.2004.10.024},
  pdf      = {../local/Dobson2005Predicting.pdf},
  file     = {Dobson2005Predicting.pdf:local/Dobson2005Predicting.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://dx.doi.org/10.1016/j.jmb.2004.10.024}
}
@article{Dobson2003Distinguishing,
  author   = {Dobson, P.D. and Doig, A.J.},
  title    = {Distinguishing enzyme structures from non-enzymes without alignments},
  journal  = {J. {M}ol. {B}iol.},
  year     = {2003},
  volume   = {330},
  pages    = {771--783},
  number   = {4},
  abstract = {The ability to predict protein function from structure is becoming increasingly important as the number of structures resolved is growing more rapidly than our capacity to study function. {C}urrent methods for predicting protein function are mostly reliant on identifying a similar protein of known function. {F}or proteins that are highly dissimilar or are only similar to proteins also lacking functional annotations, these methods fail. {H}ere, we show that protein function can be predicted as enzymatic or not without resorting to alignments. {W}e describe 1178 high-resolution proteins in a structurally non-redundant subset of the {P}rotein {D}ata {B}ank using simple features such as secondary-structure content, amino acid propensities, surface properties and ligands. {T}he subset is split into two functional groupings, enzymes and non-enzymes. {W}e use the support vector machine-learning algorithm to develop models that are capable of assigning the protein class. {V}alidation of the method shows that the function can be predicted to an accuracy of 77\% using 52 features to describe each protein. {A}n adaptive search of possible subsets of features produces a simplified model based on 36 features that predicts at an accuracy of 80\%. {W}e compare the method to sequence-based methods that also avoid calculating alignments and predict a recently released set of unrelated proteins. {T}he most useful features for distinguishing enzymes from non-enzymes are secondary-structure content, amino acid frequencies, number of disulphide bonds and size of the largest cleft. {T}his method is applicable to any structure as it does not require the identification of sequence or structural similarity to a protein of known function.},
  doi      = {10.1016/S0022-2836(03)00628-4},
  pdf      = {../local/Dobson2003Distinguishing.pdf},
  file     = {Dobson2003Distinguishing.pdf:local/Dobson2003Distinguishing.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://dx.doi.org/10.1016/S0022-2836(03)00628-4}
}
@article{Donaldson2003PreBIND,
  author   = {Donaldson, I. and Martin, J. and de Bruijn, B. and Wolting, C. and Lay, V. and Tuekam, B. and Zhang, S. and Baskin, B. and Bader, G.D. and Michalickova, K. and Pawson, T. and Hogue, C.W.V.},
  title    = {{{P}re{BIND}} and {T}extomy - mining the biomedical literature for protein-protein interactions using a support vector machine},
  journal  = {B{MC} {B}ioinformatics},
  year     = {2003},
  volume   = {4},
  pages    = {11},
  number   = {1},
  month    = mar,
  abstract = {Background {T}he majority of experimentally verified molecular interaction and biological pathway data are present in the unstructured text of biomedical journal articles where they are inaccessible to computational methods. {T}he {B}iomolecular interaction network database ({BIND}) seeks to capture these data in a machine-readable format. {W}e hypothesized that the formidable task-size of backfilling the database could be reduced by using {S}upport {V}ector {M}achine technology to first locate interaction information in the literature. {W}e present an information extraction system that was designed to locate protein-protein interaction data in the literature and present these data to curators and the public for review and entry into {BIND}. {R}esults {C}ross-validation estimated the support vector machine's test-set precision, accuracy and recall for classifying abstracts describing interaction information was 92\%, 90\% and 92\% respectively. {W}e estimated that the system would be able to recall up to 60\% of all non-high throughput interactions present in another yeast-protein interaction database. {F}inally, this system was applied to a real-world curation problem and its use was found to reduce the task duration by 70\% thus saving 176 days. {C}onclusions {M}achine learning methods are useful as tools to direct interaction and pathway database back-filling; however, this potential can only be realized if these techniques are coupled with human review and entry into a factual database such as {BIND}. {T}he {P}re{BIND} system described here is available to the public at http://bind.ca. {C}urrent capabilities allow searching for human, mouse and yeast protein-interaction information.},
  doi      = {10.1186/1471-2105-4-11},
  pdf      = {../local/Donaldson2003PreBIND.pdf},
  file     = {Donaldson2003PreBIND.pdf:local/Donaldson2003PreBIND.pdf:PDF},
  keywords = {biosvm},
  owner    = {vert},
  url      = {http://www.biomedcentral.com/1471-2105/4/11/abstract}
}
@article{Doniger2002Predicting,
  author   = {Doniger, S. and Hofmann, T. and Yeh, J.},
  title    = {Predicting {CNS} permeability of drug molecules: comparison of neural network and support vector machine algorithms},
  journal  = {J. {C}omput. {B}iol.},
  year     = {2002},
  volume   = {9},
  pages    = {849--864},
  number   = {6},
  abstract = {Two different machine-learning algorithms have been used to predict the blood-brain barrier permeability of different classes of molecules, to develop a method to predict the ability of drug compounds to penetrate the {CNS}. {T}he first algorithm is based on a multilayer perceptron neural network and the second algorithm uses a support vector machine. {B}oth algorithms are trained on an identical data set consisting of 179 {CNS} active molecules and 145 {CNS} inactive molecules. {T}he training parameters include molecular weight, lipophilicity, hydrogen bonding, and other variables that govern the ability of a molecule to diffuse through a membrane. {T}he results show that the support vector machine outperforms the neural network. {B}ased on over 30 different validation sets, the {SVM} can predict up to 96\% of the molecules correctly, averaging 81.5\% over 30 test sets, which comprised of equal numbers of {CNS} positive and negative molecules. {T}his is quite favorable when compared with the neural network's average performance of 75.7\% with the same 30 test sets. {T}he results of the {SVM} algorithm are very encouraging and suggest that a classification tool like this one will prove to be a valuable prediction approach.},
  doi      = {10.1089/10665270260518317},
  pdf      = {../local/Doniger2002Predicting.pdf},
  file     = {Doniger2002Predicting.pdf:local/Doniger2002Predicting.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert}
}
@article{Dror2005Accurate,
  author   = {Dror, G. and Sorek, R. and Shamir, R.},
  title    = {Accurate identification of alternatively spliced exons using support vector machine},
  journal  = {Bioinformatics},
  year     = {2005},
  volume   = {21},
  pages    = {897--901},
  number   = {7},
  month    = apr,
  abstract = {Motivation: {A}lternative splicing is a major component of the regulation acting on mammalian transcriptomes. {I}t is estimated that over half of all human genes have more than one splice variant. {P}revious studies have shown that alternatively spliced exons possess several features that distinguish them from constitutively spliced ones. {R}ecently, we have demonstrated that such features can be used to distinguish alternative from constitutive exons. {I}n the current study we use advanced machine learning methods to generate robust alternative exons classifier. {R}esults: {W}e extracted several hundred local sequence features of constitutive as well as alternative exons. {U}sing feature selection methods we find seven attributes that are dominant for the task of classification. {S}everal less informative features help to slightly increase the performance of the classifier. {T}he classifier achieves a true positive rate of 50\% for a false positive rate of 0.5\%. {T}his result enables one to reliably identify alternatively spliced exons in exon databases that are believed to be dominated by constitutive exons. {A}vailability: {U}pon request from the authors.},
  doi      = {10.1093/bioinformatics/bti132},
  pdf      = {../local/Dror2005Accurate.pdf},
  file     = {Dror2005Accurate.pdf:local/Dror2005Accurate.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/bti132v1}
}
@article{Dubey2005Support,
  author   = {Dubey, Anshul and Realff, Matthew J. and Lee, Jay H. and Bommarius, Andreas S.},
  title    = {Support vector machines for learning to identify the critical positions of a protein.},
  journal  = {J {T}heor {B}iol},
  year     = {2005},
  volume   = {234},
  pages    = {351--361},
  number   = {3},
  month    = jun,
  abstract = {A method for identifying the positions in the amino acid sequence, which are critical for the catalytic activity of a protein using support vector machines ({SVM}s) is introduced and analysed. {SVM}s are supported by an efficient learning algorithm and can utilize some prior knowledge about the structure of the problem. {T}he amino acid sequences of the variants of a protein, created by inducing mutations, along with their fitness are required as input data by the method to predict its critical positions. {T}o investigate the performance of this algorithm, variants of the beta-lactamase enzyme were created in silico using simulations of both mutagenesis and recombination protocols. {R}esults from literature on beta-lactamase were used to test the accuracy of this method. {I}t was also compared with the results from a simple search algorithm. {T}he algorithm was also shown to be able to predict critical positions that can tolerate two different amino acids and retain function.},
  doi      = {10.1016/j.jtbi.2004.11.037},
  pdf      = {../local/Dubey2005Support.pdf},
  file     = {Dubey2005Support.pdf:local/Dubey2005Support.pdf:PDF},
  keywords = {biosvm},
  pii      = {S0022-5193(04)00585-5},
  url      = {http://dx.doi.org/10.1016/j.jtbi.2004.11.037}
}
@article{Donnes2002Prediction,
  author   = {D{\"o}nnes, P. and Elofsson, A.},
  title    = {Prediction of {MHC} class {I} binding peptides, using {SVMHC}},
  journal  = {B{MC} {B}ioinformatics},
  year     = {2002},
  volume   = {3},
  pages    = {25},
  number   = {1},
  month    = sep,
  abstract = {Background {T}-cells are key players in regulating a specific immune response. {A}ctivation of cytotoxic {T}-cells requires recognition of specific peptides bound to {M}ajor {H}istocompatibility {C}omplex ({MHC}) class {I} molecules. {MHC}-peptide complexes are potential tools for diagnosis and treatment of pathogens and cancer, as well as for the development of peptide vaccines. {O}nly one in 100 to 200 potential binders actually binds to a certain {MHC} molecule, therefore a good prediction method for {MHC} class {I} binding peptides can reduce the number of candidate binders that need to be synthesized and tested. {R}esults {H}ere, we present a novel approach, {SVMHC}, based on support vector machines to predict the binding of peptides to {MHC} class {I} molecules. {T}his method seems to perform slightly better than two profile based methods, {SYFPEITHI} and {HLA}\_{BIND}. {T}he implementation of {SVMHC} is quite simple and does not involve any manual steps, therefore as more data become available it is trivial to provide prediction for more {MHC} types. {SVMHC} currently contains prediction for 26 {MHC} class {I} types from the {MHCPEP} database or alternatively 6 {MHC} class {I} types from the higher quality {SYFPEITHI} database. {T}he prediction models for these {MHC} types are implemented in a public web service available at http://www.sbc.su.se/svmhc/. {C}onclusions {P}rediction of {MHC} class {I} binding peptides using {S}upport {V}ector {M}achines, shows high performance and is easy to apply to a large number of {MHC} class {I} types. {A}s more peptide data are put into {MHC} databases, {SVMHC} can easily be updated to give prediction for additional {MHC} class {I} types. {W}e suggest that the number of binding peptides needed for {SVM} training is at least 20 sequences.},
  doi      = {10.1186/1471-2105-3-25},
  pdf      = {../local/Donnes2002Prediction.pdf},
  file     = {Donnes2002Prediction.pdf:local/Donnes2002Prediction.pdf:PDF},
  keywords = {biosvm immunoinformatics},
  owner    = {vert},
  url      = {http://www.biomedcentral.com/1471-2105/3/25/abstract}
}
@article{Donnes2005Integrated,
  author   = {D{\"o}nnes, P. and Kohlbacher, O.},
  title    = {Integrated modeling of the major events in the {MHC} class {I} antigen processing pathway.},
  journal  = {Protein {S}ci.},
  year     = {2005},
  volume   = {14},
  pages    = {2132--2140},
  month    = jun,
  abstract = {Rational design of epitope-driven vaccines is a key goal of immunoinformatics. {T}ypically, candidate selection relies on the prediction of {MHC}-peptide binding only, as this is known to be the most selective step in the {MHC} class {I} antigen processing pathway. {H}owever, proteasomal cleavage and transport by the transporter associated with antigen processing ({TAP}) are essential steps in antigen processing as well. {W}hile prediction methods exist for the individual steps, no method has yet offered an integrated prediction of all three major processing events. {H}ere we present {WAPP}, a method combining prediction of proteasomal cleavage, {TAP} transport, and {MHC} binding into a single prediction system. {T}he proteasomal cleavage site prediction employs a new matrix-based method that is based on experimentally verified proteasomal cleavage sites. {S}upport vector regression is used for predicting peptides transported by {TAP}. {MHC} binding is the last step in the antigen processing pathway and was predicted using a support vector machine method, {SVMHC}. {T}he individual methods are combined in a filtering approach mimicking the natural processing pathway. {WAPP} thus predicts peptides that are cleaved by the proteasome at the {C} terminus, transported by {TAP}, and show significant affinity to {MHC} class {I} molecules. {T}his results in a decrease in false positive rates compared to {MHC} binding prediction alone. {C}ompared to prediction of {MHC} binding only, we report an increased overall accuracy and a lower rate of false positive predictions for the {HLA}-{A}*0201, {HLA}-{B}*2705, {HLA}-{A}*01, and {HLA}-{A}*03 alleles using {WAPP}. {T}he method is available online through our prediction server at http://www-bs.informatik.uni-tuebingen.de/{WAPP}.},
  doi      = {10.1110/ps.051352405},
  pdf      = {../local/Donnes2005Integrated.pdf},
  file     = {Donnes2005Integrated.pdf:local/Donnes2005Integrated.pdf:PDF},
  keywords = {biosvm immunoinformatics},
  pii      = {ps.051352405},
  url      = {http://dx.doi.org/10.1110/ps.051352405}
}
@article{Fong2004Predicting,
  author   = {Fong, J. H. and Keating, A. E. and Singh, M.},
  title    = {Predicting specificity in b{ZIP} coiled-coil protein interactions},
  journal  = {Genome {B}iol.},
  year     = {2004},
  volume   = {5},
  pages    = {R11},
  number   = {2},
  abstract = {We present a method for predicting protein-protein interactions mediated by the coiled-coil motif. {W}hen tested on interactions between nearly all human and yeast b{ZIP} proteins, our method identifies 70\% of strong interactions while maintaining that 92\% of predictions are correct. {F}urthermore, cross-validation testing shows that including the b{ZIP} experimental data significantly improves performance. {O}ur method can be used to predict b{ZIP} interactions in other genomes and is a promising approach for predicting coiled-coil interactions more generally.},
  pdf      = {../local/Fong2004Predicting.pdf},
  file     = {Fong2004Predicting.pdf:local/Fong2004Predicting.pdf:PDF},
  keywords = {biosvm},
  owner    = {vert},
  url      = {http://genomebiology.com/2004/5/2/R11}
}
@article{Friedel2005Support,
  author   = {Friedel, C. C. and Jahn, K. H. V. and Sommer, S. and Rudd, S. and Mewes, H. W. and Tetko, I. V.},
  title    = {Support vector machines for separation of mixed plant-pathogen {EST} collections based on codon usage},
  journal  = {Bioinformatics},
  year     = {2005},
  volume   = {21},
  pages    = {1383--1388},
  abstract = {Motivation: {D}iscovery of host and pathogen genes expressed at the plant-pathogen interface often requires the construction of mixed libraries that contain sequences from both genomes. {S}equence identification requires high-throughput and reliable classification of genome origin. {W}hen using single-pass c{DNA} sequences difficulties arise from the short sequence length, the lack of sufficient taxonomically relevant sequence data in public databases and ambiguous sequence homology between plant and pathogen genes. {R}esults: {A} novel method is described, which is independent of the availability of homologous genes and relies on subtle differences in codon usage between plant and fungal genes. {W}e used support vector machines ({SVM}s) to identify the probable origin of sequences. {SVM}s were compared to several other machine learning techniques and to a probabilistic algorithm ({PF}-{IND}, {M}aor et al., 2003) for {EST} classification also based on codon bias differences. {O}ur software ({ECLAT}) has achieved a classification accuracy of 93.1\% on a test set of 3217 {EST} sequences from {H}. vulgare and {B}. graminis, which is a significant improvement compared to {PF}-{IND} (prediction accuracy of 81.2\% on the same test set). {EST} sequences with at least 50 nt of coding sequence can be classified by {ECLAT} with high confidence. {ECLAT} allows training of classifiers for any host-pathogen combination for which there are sufficient classified training sequences. {A}vailability: {ECLAT} is freely available on the internet (http://mips.gsf.de/proj/est) or on request as a standalone version.},
  doi      = {10.1093/bioinformatics/bti200},
  pdf      = {../local/Friedel2005Support.pdf},
  file     = {Friedel2005Support.pdf:local/Friedel2005Support.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/bti200v1}
}
@article{Fritz2002Microarray-based,
  author   = {Fritz, B. and Schubert, F. and Wrobel, G. and Schwaenen, C. and Wessendorf, S. and Nessling, M. and Korz, C. and Rieker, R. J. and Montgomery, K. and Kucherlapati, R. and Mechtersheimer, G. and Eils, R. and Joos, S. and Lichter, P.},
  title    = {Microarray-based {C}opy {N}umber and {E}xpression {P}rofiling in {D}edifferentiated and {P}leomorphic {L}iposarcoma},
  journal  = {Cancer {R}es.},
  year     = {2002},
  volume   = {62},
  pages    = {2993--2998},
  number   = {11},
  abstract = {Sixteen dedifferentiated and pleomorphic liposarcomas were analyzed by comparative genomic hybridization ({CGH}) to genomic microarrays (matrix-{CGH}), c{DNA}-derived microarrays for expression profiling, and by quantitative {PCR}. {M}atrix-{CGH} revealed copy number gains of numerous oncogenes, i.e., {CCND}1, {MDM}2, {GLI}, {CDK}4, {MYB}, {ESR}1, and {AIB}1, several of which correlate with a high level of transcripts from the respective gene. {I}n addition, a number of genes were found differentially expressed in dedifferentiated and pleomorphic liposarcomas. {A}pplication of dedicated clustering algorithms revealed that both tumor subtypes are clearly separated by the genomic profiles but only with a lesser power by the expression profiles. {U}sing a support vector machine, a subset of five clones was identified as ``class discriminators.'' {T}hus, for the distinction of these types of liposarcomas, genomic profiling appears to be more advantageous than {RNA} expression analysis.},
  pdf      = {../local/Fritz2002Microarray-based.pdf},
  file     = {Fritz2002Microarray-based.pdf:local/Fritz2002Microarray-based.pdf:PDF},
  keywords = {biosvm, cgh},
  owner    = {jeanphilippevert},
  url      = {http://cancerres.aacrjournals.org/cgi/content/abstract/62/11/2993}
}
@article{Furey2000Support,
  author   = {Furey, T. S. and Cristianini, N. and Duffy, N. and Bednarski, D. W. and Schummer, M. and Haussler, D.},
  title    = {Support vector machine classification and validation of cancer tissue samples using microarray expression data},
  journal  = {Bioinformatics},
  year     = {2000},
  volume   = {16},
  pages    = {906--914},
  number   = {10},
  month    = oct,
  abstract = {Motivation: {DNA} microarray experiments generating thousands of gene expression measurements, are being used to gather information from tissue and cell samples regarding gene expression differences that will be useful in diagnosing disease. {W}e have developed a new method to analyse this kind of data using support vector machines ({SVM}s). {T}his analysis consists of both classification of the tissue samples, and an exploration of the data for mis-labeled or questionable tissue results. {R}esults: {W}e demonstrate the method in detail on samples consisting of ovarian cancer tissues, normal ovarian tissues, and other normal tissues. {T}he dataset consists of expression experiment results for 97802 c{DNA}s for each tissue. {A}s a result of computational analysis, a tissue sample is discovered and confirmed to be wrongly labeled. {U}pon correction of this mistake and the removal of an outlier, perfect classification of tissues is achieved, but not with high confidence. {W}e identify and analyse a subset of genes from the ovarian dataset whose expression is highly differentiated between the types of tissues. {T}o show robustness of the {SVM} method, two previously published datasets from other types of tissues or cells are analysed. {T}he results are comparable to those previously obtained. {W}e show that other machine learning methods also perform comparably to the {SVM} on many of those datasets. {A}vailability: {T}he {SVM} software is available at http://www.cs.columbia.edu/~bgrundy/svm. {C}ontact: booch@cse.ucsc.edu},
  pdf      = {../local/Furey2000Support.pdf},
  file     = {Furey2000Support.pdf:local/Furey2000Support.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/16/10/906}
}
@article{Furlanello2003Entropy-based,
  author   = {Furlanello, C. and Serafini, M. and Merler, S. and Jurman, G.},
  title    = {Entropy-based gene ranking without selection bias for the predictive classification of microarray data},
  journal  = {B{MC} {B}ioinformatics},
  year     = {2003},
  volume   = {4},
  number   = {54},
  abstract = {Background {W}e describe the {E}-{RFE} method for gene ranking, which is useful for the identification of markers in the predictive classification of array data. {T}he method supports a practical modeling scheme designed to avoid the construction of classification rules based on the selection of too small gene subsets (an effect known as the selection bias, in which the estimated predictive errors are too optimistic due to testing on samples already considered in the feature selection process). {R}esults {W}ith {E}-{RFE}, we speed up the recursive feature elimination ({RFE}) with {SVM} classifiers by eliminating chunks of uninteresting genes using an entropy measure of the {SVM} weights distribution. {A}n optimal subset of genes is selected according to a two-strata model evaluation procedure: modeling is replicated by an external stratified-partition resampling scheme, and, within each run, an internal {K}-fold cross-validation is used for {E}-{RFE} ranking. {A}lso, the optimal number of genes can be estimated according to the saturation of {Z}ipf's law profiles. {C}onclusions {W}ithout a decrease of classification accuracy, {E}-{RFE} allows a speed-up factor of 100 with respect to standard {RFE}, while improving on alternative parametric {RFE} reduction strategies. {T}hus, a process for gene selection and error estimation is made practical, ensuring control of the selection bias, and providing additional diagnostic indicators of gene importance.},
  doi      = {10.1186/1471-2105-4-54},
  pdf      = {../local/Furlanello2003Entropy-based.pdf},
  file     = {Furlanello2003Entropy-based.pdf:local/Furlanello2003Entropy-based.pdf:PDF},
  keywords = {biosvm},
  owner    = {vert},
  url      = {http://www.biomedcentral.com/1471-2105/4/54}
}
@article{Gangal2005Human,
  author   = {Rajeev Gangal and Pankaj Sharma},
  title    = {Human pol {II} promoter prediction: time series descriptors and machine learning.},
  journal  = {Nucleic {A}cids {R}es},
  year     = {2005},
  volume   = {33},
  pages    = {1332-6},
  number   = {4},
  abstract = {Although several in silico promoter prediction methods have been developed to date, they are still limited in predictive performance. {T}he limitations are due to the challenge of selecting appropriate features of promoters that distinguish them from non-promoters and the generalization or predictive ability of the machine-learning algorithms. {I}n this paper we attempt to define a novel approach by using unique descriptors and machine-learning methods for the recognition of eukaryotic polymerase {II} promoters. {I}n this study, non-linear time series descriptors along with non-linear machine-learning algorithms, such as support vector machine ({SVM}), are used to discriminate between promoter and non-promoter regions. {T}he basic idea here is to use descriptors that do not depend on the primary {DNA} sequence and provide a clear distinction between promoter and non-promoter regions. {T}he classification model built on a set of 1000 promoter and 1500 non-promoter sequences, showed a 10-fold cross-validation accuracy of 87\% and an independent test set had an accuracy >85\% in both promoter and non-promoter identification. {T}his approach correctly identified all 20 experimentally verified promoters of human chromosome 22. {T}he high sensitivity and selectivity indicates that n-mer frequencies along with non-linear time series descriptors, such as {L}yapunov component stability and {T}sallis entropy, and supervised machine-learning methods, such as {SVM}s, can be useful in the identification of pol {II} promoters.},
  doi      = {10.1093/nar/gki271},
  pdf      = {../local/Gangal2005Human.pdf},
  file     = {Gangal2005Human.pdf:local/Gangal2005Human.pdf:PDF},
  keywords = {biosvm},
  pii      = {33/4/1332},
  url      = {http://dx.doi.org/10.1093/nar/gki271}
}
@article{Gardy2005PSORTb,
  author   = {Gardy, J. L. and Laird, M. R. and Chen, F. and Rey, S. and Walsh, C. J. and Ester, M. and Brinkman, F. S. L.},
  title    = {{{PSORT}b v.2.0}: expanded prediction of bacterial protein subcellular localization and insights gained from comparative proteome analysis},
  journal  = {Bioinformatics},
  year     = {2005},
  volume   = {21},
  pages    = {617-623},
  number   = {5},
  month    = {Mar},
  abstract = {Motivation: {PSORT}b v.1.1 is the most precise bacterial localization prediction tool available. {H}owever the program's predictive coverage and recall are low and the method is only applicable to {G}ram-negative bacteria. {T}he goals of the present work were: increase {PSORT}b's coverage while maintaining the existing precision level, expand it to include {G}ram-positive bacteria, and then carry out a comparative analysis of localization. {R}esults: {A}n expanded database of proteins of known localization and new modules using frequent subsequence-based support vector machines were introduced into {PSORT}b v.2.0. {T}he program attains a precision of 96\% for {G}ram-positive and {G}ram-negative bacteria and predictive coverage comparable to other tools for whole proteome analysis. {W}e show that the proportion of proteins at each localization is remarkably consistent across species, even in species with varying proteome size. {A}vailability: {W}eb-based version: http://www.psort.org/psortb. {S}tandalone version: {A}vailable through the website under {GNU} {G}eneral {P}ublic {L}icense. {S}upplementary {I}nformation: http://www.psort.org/psortb/supplementaryinfo.html.},
  doi      = {10.1093/bioinformatics/bti057},
  pdf      = {../local/Gardy2005PSORTb.pdf},
  file     = {Gardy2005PSORTb.pdf:local/Gardy2005PSORTb.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://dx.doi.org/10.1093/bioinformatics/bti057}
}
@article{Garg2005SVM-based,
  author   = {Garg, A. and Bhasin, M. and Raghava, G.P.},
  title    = {S{VM}-based method for subcellular localization of human proteins using amino acid compositions, their order and similarity search},
  journal  = {J. {B}iol. {C}hem.},
  year     = {2005},
  volume   = {280},
  pages    = {14427-32},
  number   = {15},
  month    = {Apr},
  abstract = {Here we report a systematic approach for predicting subcellular localization (cytoplasm, mitochondrial, nuclear and plasma membrane) of human proteins. {F}irstly, {SVM} based modules for predicting subcellular localization using traditional amino acid and dipeptide (i+1) composition achieved overall accuracy of 76.6\% and 77.8\%, respectively. {PSI}-{BLAST} when carried out using similarity-based search against non-redundant database of experimentally annotated proteins yielded 73.3\% accuracy. {T}o gain further insight, hybrid module (hybrid1) was developed based on amino acid composition, dipeptide composition, and similarity information and attained better accuracy of 84.9\%. {I}n addition, {SVM} module based on different higher order dipeptide i.e. i+2, i+3, and i+4 were also constructed for the prediction of subcellular localization of human proteins and overall accuracy of 79.7\%, 77.5\% and 77.1\% was accomplished respectively. {F}urthermore, another {SVM} module hybrid2 was developed using traditional dipeptide (i+1) and higher order dipeptide (i+2, i+3, and i+4) compositions, which gave an overall accuracy of 81.3\%. {W}e also developed {SVM} module hybrid3 based on amino acid composition, traditional and higher order dipeptide compositions and {PSI}-{BLAST} output and achieved an overall accuracy of 84.4\%. {A} web server {HSLP}red (http://www.imtech.res.in/raghava/hslpred/ or http://bioinformatics.uams.edu/raghava/hslpred/) has been designed to predict subcellular localization of human proteins using the above approaches.},
  doi      = {10.1074/jbc.M411789200},
  pdf      = {../local/Garg2005SVM-based.pdf},
  file     = {Garg2005SVM-based.pdf:local/Garg2005SVM-based.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://dx.doi.org/10.1074/jbc.M411789200}
}
@article{Gaudan2005Resolving,
  author   = {Gaudan, S. and Kirsch, H. and Rebholz-Schuhmann, D.},
  title    = {Resolving abbreviations to their senses in {M}edline.},
  journal  = {Bioinformatics},
  year     = {2005},
  month    = {Jul},
  abstract = {M{OTIVATION}: {B}iological literature contains many abbreviations with one particular sense in each document. {H}owever, most abbreviations do not have a unique sense across the literature. {F}urthermore, many documents do not contain the long-forms of the abbreviations. {R}esolving an abbreviation in a document consists of retrieving its sense in use. {A}bbreviation resolution improves accuracy of document retrieval engines and of information extraction systems. {RESULTS}: {W}e combine an automatic analysis of {M}edline abstracts and linguistic methods to build a dictionary of abbreviation/sense pairs. {T}he dictionary is used for the resolution of abbreviations occurring with their long-forms. {A}mbiguous global abbreviations are resolved using {S}upport {V}ector {M}achines that have been trained on the context of each instance of the abbreviation/sense pairs, previously extracted for the dictionary setup. {T}he system disambiguates abbreviations with a precision of 98.9\% for a recall of 98.2\% (98.5\% accuracy). {T}his performance is superior in comparison to previously reported research work. {AVAILABILITY}: {T}he abbreviation resolution module is available at http://www.ebi.ac.uk/{R}ebholz/software.html.},
  doi      = {10.1093/bioinformatics/bti586},
  pdf      = {../local/Gaudan2005Resolving.pdf},
  file     = {Gaudan2005Resolving.pdf:local/Gaudan2005Resolving.pdf:PDF},
  keywords = {biosvm nlp},
  pii      = {bti586},
  url      = {http://dx.doi.org/10.1093/bioinformatics/bti586}
}
@article{Gomez2003Learning,
  author   = {Gomez, S. M. and Noble, W. S. and Rzhetsky, A.},
  title    = {Learning to predict protein-protein interactions from protein sequences},
  journal  = {Bioinformatics},
  year     = {2003},
  volume   = {19},
  pages    = {1875-1881},
  number   = {15},
  abstract = {In order to understand the molecular machinery of the cell, we need to know about the multitude of protein-protein interactions that allow the cell to function. {H}igh-throughput technologies provide some data about these interactions, but so far that data is fairly noisy. {T}herefore, computational techniques for predicting protein-protein interactions could be of significant value. {O}ne approach to predicting interactions in silico is to produce from first principles a detailed model of a candidate interaction. {W}e take an alternative approach, employing a relatively simple model that learns dynamically from a large collection of data. {I}n this work, we describe an attraction-repulsion model, in which the interaction between a pair of proteins is represented as the sum of attractive and repulsive forces associated with small, domain- or motif-sized features along the length of each protein. {T}he model is discriminative, learning simultaneously from known interactions and from pairs of proteins that are known (or suspected) not to interact. {T}he model is efficient to compute and scales well to very large collections of data. {I}n a cross-validated comparison using known yeast interactions, the attraction-repulsion method performs better than several competing techniques.},
  pdf      = {../local/Gomez2003Learning.pdf},
  file     = {Gomez2003Learning.pdf:local/Gomez2003Learning.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/15/1875}
}
@article{Gordon2003Sequence,
  author   = {Gordon, L. and Chervonenkis, A. Y. and Gammerman, A. J. and Shahmuradov, I. A. and Solovyev, V. V.},
  title    = {Sequence alignment kernel for recognition of promoter regions},
  journal  = {Bioinformatics},
  year     = {2003},
  volume   = {19},
  pages    = {1964-1971},
  number   = {15},
  abstract = {In this paper we propose a new method for recognition of prokaryotic promoter regions with startpoints of transcription. {T}he method is based on {S}equence {A}lignment {K}ernel, a function reflecting the quantitative measure of match between two sequences. {T}his kernel function is further used in {D}ual {SVM}, which performs the recognition. {S}everal recognition methods have been trained and tested on positive data set, consisting of 669 $\sigma^{70}$-promoter regions with known transcription startpoints of {E}scherichia coli and two negative data sets of 709 examples each, taken from coding and non-coding regions of the same genome. {T}he results show that our method performs well and achieves 16.5\% average error rate on positive \& coding negative data and 18.6\% average error rate on positive \& non-coding negative data. {A}vailability: {T}he demo version of our method is accessible from our website http://mendel.cs.rhul.ac.uk/},
  pdf      = {../local/Gordon2003Sequence.pdf},
  file     = {Gordon2003Sequence.pdf:local/Gordon2003Sequence.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/15/1964}
}
@article{Guermeur2002Combining,
  author   = {Guermeur, Y.},
  title    = {Combining {D}iscriminant {M}odels with {N}ew {M}ulti-{C}lass {SVM}s},
  journal  = {Pattern {A}nal. {A}ppl.},
  year     = {2002},
  volume   = {5},
  pages    = {168-179},
  number   = {2},
  abstract = {The idea of performing model combination, instead of model selection, has a long theoretical background in statistics. {H}owever, making use of theoretical results is ordinarily subject to the satisfaction of strong hypotheses (weak error correlation, availability of large training sets, possibility to rerun the training procedure an arbitrary number of times, etc.). {I}n contrast, the practitioner is frequently faced with the problem of combining a given set of pre-trained classifiers, with highly correlated errors, using only a small training sample. {O}verfitting is then the main risk, which cannot be overcome but with a strict complexity control of the combiner selected. {T}his suggests that {SVM}s should be well suited for these difficult situations. {I}nvestigating this idea, we introduce a family of multi-class {SVM}s and assess them as ensemble methods on a real-world problem. {T}his task, protein secondary structure prediction, is an open problem in biocomputing for which model combination appears to be an issue of central importance. {E}xperimental evidence highlights the gain in quality resulting from combining some of the most widely used prediction methods with our {SVM}s rather than with the ensemble methods traditionally used in the field. {T}he gain increases when the outputs of the combiners are post-processed with a {DP} algorithm.},
  doi      = {10.1007/s100440200015},
  pdf      = {../local/Guermeur2002Combining.pdf},
  file     = {Guermeur2002Combining.pdf:local/Guermeur2002Combining.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://dx.doi.org/10.1007/s100440200015}
}
@incollection{Guermeur2004kernel,
  author    = {Guermeur, Y. and Lifschitz, A. and Vert, R.},
  title     = {A kernel for protein secondary structure prediction},
  booktitle = {Kernel {M}ethods in {C}omputational {B}iology},
  publisher = {MIT Press},
  year      = {2004},
  editor    = {Sch{\"o}lkopf, B. and Tsuda, K. and Vert, J.P.},
  pages     = {193-206},
  keywords  = {biosvm},
  owner     = {vert}
}
@article{Guermeur2004Combining,
  author   = {Guermeur, Y. and Pollastri, G. and Elisseeff, A. and Zelus, D. and Paugam-Moisy, H. and Baldi, P.},
  title    = {Combining protein secondary structure prediction models with ensemble methods of optimal complexity},
  journal  = {Neurocomputing},
  year     = {2004},
  volume   = {56},
  pages    = {305-327},
  abstract = {Many sophisticated methods are currently available to perform protein secondary structure prediction. {S}ince they are frequently based on different principles, and different knowledge sources, significant benefits can be expected from combining them. {H}owever, the choice of an appropriate combiner appears to be an issue in its own right. {T}he first difficulty to overcome when combining prediction methods is overfitting. {T}his is the reason why we investigate the implementation of {S}upport {V}ector {M}achines to perform the task. {A} family of multi-class {SVM}s is introduced. {T}wo of these machines are used to combine some of the current best protein secondary structure prediction methods. {T}heir performance is consistently superior to the performance of the ensemble methods traditionally used in the field. {T}hey also outperform the decomposition approaches based on bi-class {SVM}s. {F}urthermore, initial experimental evidence suggests that their outputs could be processed by the biologist to perform higher-level treatments.},
  doi      = {10.1016/j.neucom.2003.10.004},
  pdf      = {../local/Guermeur2004Combining.pdf},
  file     = {Guermeur2004Combining.pdf:local/Guermeur2004Combining.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://dx.doi.org/10.1016/j.neucom.2003.10.004}
}
@article{Guo2004novel,
  author   = {Guo, J. and Chen, H. and Sun, Z. and Lin, Y.},
  title    = {A novel method for protein secondary structure prediction using dual-layer {SVM} and profiles},
  journal  = {Proteins},
  year     = {2004},
  volume   = {54},
  pages    = {738-743},
  number   = {4},
  abstract = {A high-performance method was developed for protein secondary structure prediction based on the dual-layer support vector machine ({SVM}) and position-specific scoring matrices ({PSSM}s). {SVM} is a new machine learning technology that has been successfully applied in solving problems in the field of bioinformatics. {T}he {SVM}'s performance is usually better than that of traditional machine learning approaches. {T}he performance was further improved by combining {PSSM} profiles with the {SVM} analysis. {T}he {PSSM}s were generated from {PSI}-{BLAST} profiles, which contain important evolution information. {T}he final prediction results were generated from the second {SVM} layer output. {O}n the {CB}513 data set, the three-state overall per-residue accuracy, {Q}3, reached 75.2\%, while segment overlap ({SOV}) accuracy increased to 80.0\%. {O}n the {CB}396 data set, the {Q}3 of our method reached 74.0\% and the {SOV} reached 78.1\%. {A} web server utilizing the method has been constructed and is available at http://www.bioinfo.tsinghua.edu.cn/pmsvm.},
  doi      = {10.1002/prot.10634},
  pdf      = {../local/Guo2004novel.pdf},
  file     = {Guo2004novel.pdf:local/Guo2004novel.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://dx.doi.org/10.1002/prot.10634}
}
@article{Guo2005novel,
  author   = {Ting Guo and Yanxin Shi and Zhirong Sun},
  title    = {A novel statistical ligand-binding site predictor: application to {ATP}-binding sites.},
  journal  = {Protein {E}ng {D}es {S}el},
  year     = {2005},
  volume   = {18},
  pages    = {65-70},
  number   = {2},
  month    = {Feb},
  abstract = {Structural genomics initiatives are leading to rapid growth in newly determined protein 3{D} structures, the functional characterization of which may still be inadequate. {A}s an attempt to provide insights into the possible roles of the emerging proteins whose structures are available and/or to complement biochemical research, a variety of computational methods have been developed for the screening and prediction of ligand-binding sites in raw structural data, including statistical pattern classification techniques. {I}n this paper, we report a novel statistical descriptor (the {O}riented {S}hell {M}odel) for protein ligand-binding sites, which utilizes the distance and angular position distribution of various structural and physicochemical features present in immediate proximity to the center of a binding site. {U}sing the support vector machine ({SVM}) as the classifier, our model identified 69\% of the {ATP}-binding sites in whole-protein scanning tests and in eukaryotic proteins the accuracy is particularly high. {W}e propose that this feature extraction and machine learning procedure can screen out ligand-binding-capable protein candidates and can yield valuable biochemical information for individual proteins.},
  doi      = {10.1093/protein/gzi006},
  pdf      = {../local/Guo2005novel.pdf},
  file     = {Guo2005novel.pdf:local/Guo2005novel.pdf:PDF},
  keywords = {biosvm},
  pii      = {gzi006},
  url      = {http://dx.doi.org/10.1093/protein/gzi006}
}
@article{Gururaja2003Multiple,
  author   = {Gururaja, T. and Li, W. and Noble, W.S. and Payan, D.G. and Anderson, D.C.},
  title    = {Multiple functional categories of proteins identified in an in vitro cellular ubiquitin affinity extract using shotgun peptide sequencing},
  journal  = {J {P}roteome {R}es},
  year     = {2003},
  volume   = {2},
  pages    = {394-404},
  number   = {4},
  abstract = {Using endogenous human cellular ubiquitin system enzymes and added his-tagged ubiquitin, {ATP}, and an {ATP}-regenerating system, we labelled cellular proteins with hexahistidine tagged ubiquitin in vitro. {L}abeling was dependent on {ATP} and the {ATP} recycling system, on the proteasome inhibitor {MG}132 and the ubiquitin protease inhibitor ubiquitin aldehyde, and was inhibited by iodoacetamide. {L}abeled proteins were affinity extracted in quadruplicate and tryptic peptides identified by 2{D} capillary {LC}/{MS}/{MS} combined with {SEQUEST} and {MEDUSA} analyses. {S}upport vector machine analysis of the mass spectrometry data allowed prediction of correct matches between mass spectrometry data and peptide sequences. {O}verall, 144 proteins were identified by peptides predicted to be correctly sequenced, and 113 were identified by at least three peptides or one or two peptides with at least an 80\% chance of being correct. {I}dentified proteins included 22 proteasome subunits or associated proteins, 18 {E}1, {E}2 or {E}3 ubiquitin system enzymes or related proteins, and four ubiquitin domain proteins. {S}eventeen directly ubiquitinated proteins or proteins associated with the ubiquitin system were identified. {F}unctional clusters of other proteins included redox enzymes, proteins associated with endocytosis, cytoskeletal proteins, {DNA} damage or repair related proteins, calcium binding proteins, and splicing factor and related proteins, suggesting that in vitro ubiquitination is not random, and that these functions may be regulated by the ubiquitin system. {T}his map of cellular ubiquitinated proteins and their interacting proteins will be useful for further studies of ubiquitin system function.},
  pdf      = {../local/Gururaja2003Multiple.pdf},
  file     = {Gururaja2003Multiple.pdf:local/Gururaja2003Multiple.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert}
}
@article{Guyon2002Gene,
  author   = {Guyon, I. and Weston, J. and Barnhill, S. and Vapnik, V.},
  title    = {Gene selection for cancer classification using support vector machines},
  journal  = {Mach. Learn.},
  year     = {2002},
  volume   = {46},
  pages    = {389-422},
  number   = {1--3},
  month    = {Jan},
  abstract = {D{NA} micro-arrays now permit scientists to screen thousands of genes simultaneously and determine whether those genes are active, hyperactive or silent in normal or cancerous tissue. {B}ecause these new micro-array devices generate bewildering amounts of raw data, new analytical methods must be developed to sort out whether cancer tissues have distinctive signatures of gene expression over normal tissues or other types of cancer tissues. {I}n this paper, we address the problem of selection of a small subset of genes from broad patterns of gene expression data, recorded on {DNA} micro-arrays. {U}sing available training examples from cancer and normal patients, we build a classifier suitable for genetic diagnosis, as well as drug discovery. {P}revious attempts to address this problem select genes with correlation techniques. {W}e propose a new method of gene selection utilizing {S}upport {V}ector {M}achine methods based on {R}ecursive {F}eature {E}limination ({RFE}). {W}e demonstrate experimentally that the genes selected by our techniques yield better classification performance and are biologically relevant to cancer. {I}n contrast with the baseline method, our method eliminates gene redundancy automatically and yields better and more compact gene subsets. {I}n patients with leukemia our method discovered 2 genes that yield zero leave-one-out error, while 64 genes are necessary for the baseline method to get the best result (one leave-one-out error). {I}n the colon cancer database, using only 4 genes our method is 98\% accurate, while the baseline method is only 86\% accurate.},
  pdf      = {../local/Guyon2002Gene.pdf},
  file     = {Guyon2002Gene.pdf:local/Guyon2002Gene.pdf:PDF},
  keywords = {biosvm},
  subject  = {biokernel},
  url      = {http://homepages.nyu.edu/~jaw281/genesel.pdf}
}
@article{Gartner2004Kernels,
  author    = {G{\"a}rtner, T. and Lloyd, J.W. and Flach, P.A.},
  title     = {Kernels and Distances for Structured Data},
  journal   = {Mach. Learn.},
  year      = {2004},
  volume    = {57},
  pages     = {205-232},
  number    = {3},
  abstract  = {This paper brings together two strands of machine learning of increasing importance: kernel methods and highly structured data. We propose a general method for constructing a kernel following the syntactic structure of the data, as defined by its type signature in a higher-order logic. Our main theoretical result is the positive definiteness of any kernel thus defined. We report encouraging experimental results on a range of real-world data sets. By converting our kernel to a distance pseudo-metric for 1-nearest neighbour, we were able to improve the best accuracy from the literature on the Diterpene data set by more than 10\%.},
  doi       = {10.1023/B:MACH.0000039777.23772.30},
  keywords  = {biosvm},
  timestamp = {2006.07.11},
  url       = {http://dx.doi.org/10.1023/B:MACH.0000039777.23772.30}
}
@article{Haferlach2005AML,
  author   = {Torsten Haferlach and Alexander Kohlmann and Susanne Schnittger and Martin Dugas and Wolfgang Hiddemann and Wolfgang Kern and Claudia Schoch},
  title    = {A{ML} {M}3 and {AML} {M}3 variant each have a distinct gene expression signature but also share patterns different from other genetically defined {AML} subtypes.},
  journal  = {Genes {C}hromosomes {C}ancer},
  year     = {2005},
  volume   = {43},
  pages    = {113-27},
  number   = {2},
  month    = {Jun},
  abstract = {Acute promyelocytic leukemia ({APL}) with t(15;17) appears in two phenotypes: {AML} {M}3, with abnormal promyelocytes showing heavy granulation and bundles of {A}uer rods, and {AML} {M}3 variant ({M}3v), with non- or hypogranular cytoplasm and a bilobed nucleus. {W}e investigated the global gene expression profiles of 35 {APL} patients (19 {AML} {M}3, 16 {AML} {M}3v) by using high-density {DNA}-oligonucleotide microarrays. {F}irst, an unsupervised approach clearly separated {APL} samples from other {AML}s characterized genetically as t(8;21) (n = 35), inv(16) (n = 35), or t(11q23)/{MLL} (n = 35) or as having a normal karyotype (n = 50). {S}econd, we found genes with functional relevance for blood coagulation that were differentially expressed between {APL} and other {AML}s. {F}urthermore, a supervised pairwise comparison between {M}3 and {M}3v revealed differential expression of genes that encode for biological functions and pathways such as granulation and maturation of hematologic cells, explaining morphologic and clinical differences. {D}iscrimination between {M}3 and {M}3v based on gene signatures showed a median classification accuracy of 90\% by use of 10-fold {CV} and support vector machines. {A}dditional molecular mutations such as {FLT}3-{LM}, which were significantly more frequent in {M}3v than in {M}3 ({P} < 0.0001), may partly contribute to the different phenotypes. {H}owever, linear regression analysis demonstrated that genes differentially expressed between {M}3 and {M}3v did not correlate with {FLT}3-{LM}.},
  doi      = {10.1002/gcc.20175},
  pdf      = {../local/Haferlach2005AML.pdf},
  file     = {Haferlach2005AML.pdf:local/Haferlach2005AML.pdf:PDF},
  keywords = {biosvm microarray},
  url      = {http://dx.doi.org/10.1002/gcc.20175}
}
@article{Haferlach2005global,
  author   = {Torsten Haferlach and Alexander Kohlmann and Susanne Schnittger and Martin Dugas and Wolfgang Hiddemann and Wolfgang Kern and Claudia Schoch},
  title    = {A global approach to the diagnosis of leukemia using gene expression profiling.},
  journal  = {Blood},
  year     = {2005},
  volume   = {106},
  pages    = {1189-1198},
  number   = {4},
  month    = {Aug},
  abstract = {Accurate diagnosis and classification of leukemias are the bases for the appropriate management of patients. {T}he diagnostic accuracy and efficiency of present methods may be improved by the use of microarrays for gene expression profiling. {W}e analyzed gene expression profiles in bone marrow and peripheral blood samples from 937 patients with all clinically relevant leukemia subtypes (n=892) and non-leukemic controls (n=45) by {U}133{A} and {B} {G}ene{C}hips ({A}ffymetrix). {F}or each subgroup differentially expressed genes were calculated. {C}lass prediction was performed using support vector machines. {P}rediction accuracies were estimated by 10-fold cross validation and assessed for robustness in a 100-fold resampling approach using randomly chosen test-sets consisting of 1/3 of the samples. {A}pplying the top 100 genes of each subgroup an overall prediction accuracy of 95.1\% was achieved which was confirmed by resampling (median, 93.8\%; 95\% confidence interval, 91.4\%-95.8\%). {I}n particular, {AML} with t(15;17), t(8;21), or inv(16), {CLL}, and {P}ro-{B}-{ALL} with t(11q23) were classified with 100\% sensitivity and 100\% specificity. {A}ccordingly, cluster analysis completely separated all of the 13 subgroups analyzed. {G}ene expression profiling can predict all clinically relevant subentities of leukemia with high accuracy.},
  doi      = {10.1182/blood-2004-12-4938},
  pdf      = {../local/Haferlach2005global.pdf},
  file     = {Haferlach2005global.pdf:local/Haferlach2005global.pdf:PDF},
  keywords = {biosvm microarray},
  pii      = {2004-12-4938},
  url      = {http://dx.doi.org/10.1182/blood-2004-12-4938}
}
@article{Hakenberg2005Systematic,
  author   = {J{\"o}rg Hakenberg and Steffen Bickel and Conrad Plake and Ulf Brefeld and Hagen Zahn and Lukas Faulstich and Ulf Leser and Tobias Scheffer},
  title    = {Systematic feature evaluation for gene name recognition.},
  journal  = {B{MC} {B}ioinformatics},
  year     = {2005},
  volume   = {6 Suppl 1},
  pages    = {S9},
  abstract = {In task 1{A} of the {B}io{C}re{A}t{I}v{E} evaluation, systems had to be devised that recognize words and phrases forming gene or protein names in natural language sentences. {W}e approach this problem by building a word classification system based on a sliding window approach with a {S}upport {V}ector {M}achine, combined with a pattern-based post-processing for the recognition of phrases. {T}he performance of such a system crucially depends on the type of features chosen for consideration by the classification method, such as pre- or postfixes, character n-grams, patterns of capitalization, or classification of preceding or following words. {W}e present a systematic approach to evaluate the performance of different feature sets based on recursive feature elimination, {RFE}. {B}ased on a systematic reduction of the number of features used by the system, we can quantify the impact of different feature sets on the results of the word classification problem. {T}his helps us to identify descriptive features, to learn about the structure of the problem, and to design systems that are faster and easier to understand. {W}e observe that the {SVM} is robust to redundant features. {RFE} improves the performance by 0.7\%, compared to using the complete set of attributes. {M}oreover, a performance that is only 2.3\% below this maximum can be obtained using fewer than 5\% of the features.},
  doi      = {10.1186/1471-2105-6-S1-S9},
  pdf      = {../local/Hakenberg2005Systematic.pdf},
  file     = {Hakenberg2005Systematic.pdf:local/Hakenberg2005Systematic.pdf:PDF},
  keywords = {biosvm},
  pii      = {1471-2105-6-S1-S9},
  url      = {http://dx.doi.org/10.1186/1471-2105-6-S1-S9}
}
@article{Hakenberg2004Finding,
  author   = {Hakenberg, J. and Schmeier, S. and Kowald, A. and Klipp, E. and Leser, U.},
  title    = {Finding kinetic parameters using text mining.},
  journal  = {O{MICS}},
  year     = {2004},
  volume   = {8},
  pages    = {131-152},
  number   = {2},
  abstract = {The mathematical modeling and description of complex biological processes has become more and more important over the last years. {S}ystems biology aims at the computational simulation of complex systems, up to whole cell simulations. {A}n essential part focuses on solving a large number of parameterized differential equations. {H}owever, measuring those parameters is an expensive task, and finding them in the literature is very laborious. {W}e developed a text mining system that supports researchers in their search for experimentally obtained parameters for kinetic models. {O}ur system classifies full text documents regarding the question whether or not they contain appropriate data using a support vector machine. {W}e evaluated our approach on a manually tagged corpus of 800 documents and found that it outperforms keyword searches in abstracts by a factor of five in terms of precision.},
  doi      = {10.1089/1536231041388366},
  pdf      = {../local/Hakenberg2004Finding.pdf},
  file     = {Hakenberg2004Finding.pdf:local/Hakenberg2004Finding.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://www.liebertonline.com/doi/abs/10.1089%2F1536231041388366}
}
@article{Han2004Predicting,
  author   = {Han, L.Y. and Cai, C.Z. and Ji, Z.L. and Cao, Z.W. and Cui, J. and Chen, Y.Z.},
  title    = {Predicting functional family of novel enzymes irrespective of sequence similarity: a statistical learning approach.},
  journal  = {Nucl. {A}cids {R}es.},
  year     = {2004},
  volume   = {32},
  pages    = {6437-6444},
  number   = {21},
  abstract = {The function of a protein that has no sequence homolog of known function is difficult to assign on the basis of sequence similarity. {T}he same problem may arise for homologous proteins of different functions if one is newly discovered and the other is the only known protein of similar sequence. {I}t is desirable to explore methods that are not based on sequence similarity. {O}ne approach is to assign functional family of a protein to provide useful hint about its function. {S}everal groups have employed a statistical learning method, support vector machines ({SVM}s), for predicting protein functional family directly from sequence irrespective of sequence similarity. {T}hese studies showed that {SVM} prediction accuracy is at a level useful for functional family assignment. {B}ut its capability for assignment of distantly related proteins and homologous proteins of different functions has not been critically and adequately assessed. {H}ere {SVM} is tested for functional family assignment of two groups of enzymes. {O}ne consists of 50 enzymes that have no homolog of known function from {PSI}-{BLAST} search of protein databases. {T}he other contains eight pairs of homologous enzymes of different families. {SVM} correctly assigns 72\% of the enzymes in the first group and 62\% of the enzyme pairs in the second group, suggesting that it is potentially useful for facilitating functional study of novel proteins. {A} web version of our software, {SVMP}rot, is accessible at http://jing.cz3.nus.edu.sg/cgi-bin/svmprot.cgi.},
  doi      = {10.1093/nar/gkh984},
  pdf      = {../local/Han2004Predicting.pdf},
  file     = {Han2004Predicting.pdf:local/Han2004Predicting.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://dx.doi.org/10.1093/nar/gkh984}
}
@article{Han2005Prediction,
  author   = {Han, L.Y. and Cai, C.Z. and Ji, Z.L. and Chen, Y.Z.},
  title    = {Prediction of functional class of novel viral proteins by a statistical learning method irrespective of sequence similarity},
  journal  = {Virology},
  year     = {2005},
  volume   = {331},
  pages    = {136-143},
  number   = {1},
  abstract = {The function of a substantial percentage of the putative protein-coding open reading frames ({ORF}s) in viral genomes is unknown. {A}s their sequence is not similar to that of proteins of known function, the function of these {ORF}s cannot be assigned on the basis of sequence similarity. {M}ethods complement or in combination with sequence similarity-based approaches are being explored. {T}he web-based software {SVMP}rot () to some extent assigns protein functional family irrespective of sequence similarity and has been found to be useful for studying distantly related proteins [{C}ai, {C}.{Z}., {H}an, {L}.{Y}., {J}i, {Z}.{L}., {C}hen, {X}., {C}hen, {Y}.{Z}., 2003. {SVM}-{P}rot: web-based support vector machine software for functional classification of a protein from its primary sequence. {N}ucleic {A}cids {R}es. 31(13): 3692-3697]. {H}ere 25 novel viral proteins are selected to test the capability of {SVMP}rot for functional family assignment of viral proteins whose function cannot be confidently predicted on by sequence similarity methods at present. {T}hese proteins are without a sequence homolog in the {S}wissprot database, with its precise function provided in the literature, and not included in the training sets of {SVMP}rot. {T}he predicted functional classes of 72\% of these proteins match the literature-described function, which is compared to the overall accuracy of 87\% for {SVMP}rot functional class assignment of 34582 proteins. {T}his suggests that {SVMP}rot to some extent is capable of functional class assignment irrespective of sequence similarity and it is potentially useful for facilitating functional study of novel viral proteins.},
  doi      = {10.1016/j.virol.2004.10.020},
  pdf      = {../local/Han2005Prediction.pdf},
  file     = {Han2005Prediction.pdf:local/Han2005Prediction.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://dx.doi.org/10.1016/j.virol.2004.10.020}
}
@article{Han2004Prediction,
  author   = {Han, L.Y. and Cai, C.Z. and Lo, S.L. and Chung, M.C. and Chen, Y.Z.},
  title    = {Prediction of {RNA}-binding proteins from primary sequence by a support vector machine approach.},
  journal  = {R{NA}},
  year     = {2004},
  volume   = {10},
  pages    = {355-368},
  number   = {3},
  abstract = {Elucidation of the interaction of proteins with different molecules is of significance in the understanding of cellular processes. {C}omputational methods have been developed for the prediction of protein-protein interactions. {B}ut insufficient attention has been paid to the prediction of protein-{RNA} interactions, which play central roles in regulating gene expression and certain {RNA}-mediated enzymatic processes. {T}his work explored the use of a machine learning method, support vector machines ({SVM}), for the prediction of {RNA}-binding proteins directly from their primary sequence. {B}ased on the knowledge of known {RNA}-binding and non-{RNA}-binding proteins, an {SVM} system was trained to recognize {RNA}-binding proteins. {A} total of 4011 {RNA}-binding and 9781 non-{RNA}-binding proteins was used to train and test the {SVM} classification system, and an independent set of 447 {RNA}-binding and 4881 non-{RNA}-binding proteins was used to evaluate the classification accuracy. {T}esting results using this independent evaluation set show a prediction accuracy of 94.1\%, 79.3\%, and 94.1\% for r{RNA}-, m{RNA}-, and t{RNA}-binding proteins, and 98.7\%, 96.5\%, and 99.9\% for non-r{RNA}-, non-m{RNA}-, and non-t{RNA}-binding proteins, respectively. {T}he {SVM} classification system was further tested on a small class of sn{RNA}-binding proteins with only 60 available sequences. {T}he prediction accuracy is 40.0\% and 99.9\% for sn{RNA}-binding and non-sn{RNA}-binding proteins, indicating a need for a sufficient number of proteins to train {SVM}. {T}he {SVM} classification systems trained in this work were added to our {W}eb-based protein functional classification software {SVMP}rot, at http://jing.cz3.nus.edu.sg/cgi-bin/svmprot.cgi. {O}ur study suggests the potential of {SVM} as a useful tool for facilitating the prediction of protein-{RNA} interactions.},
  pdf      = {../local/Han2004Prediction.pdf},
  file     = {Han2004Prediction.pdf:local/Han2004Prediction.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://www.rnajournal.org/cgi/content/abstract/10/3/355}
}
@article{Han2005Fold,
  author   = {Han, Sangjo and Lee, Byung-Chul and Yu, Seung Taek and Jeong, Chan-Seok and Lee, Soyoung and Kim, Dongsup},
  title    = {Fold recognition by combining profile-profile alignment and support vector machine.},
  journal  = {Bioinformatics},
  year     = {2005},
  volume   = {21},
  pages    = {2667--2673},
  number   = {11},
  month    = jun,
  abstract = {M{OTIVATION}: {C}urrently, the most accurate fold-recognition method is to perform profile-profile alignments and estimate the statistical significances of those alignments by calculating {Z}-score or {E}-value. {A}lthough this scheme is reliable in recognizing relatively close homologs related at the family level, it has difficulty in finding the remote homologs that are related at the superfamily or fold level. {RESULTS}: {I}n this paper, we present an alternative method to estimate the significance of the alignments. {T}he alignment between a query protein and a template of length n in the fold library is transformed into a feature vector of length n + 1, which is then evaluated by support vector machine ({SVM}). {T}he output from {SVM} is converted to a posterior probability that a query sequence is related to a template, given {SVM} output. {R}esults show that a new method shows significantly better performance than {PSI}-{BLAST} and profile-profile alignment with {Z}-score scheme. {W}hile {PSI}-{BLAST} and {Z}-score scheme detect 16 and 20\% of superfamily-related proteins, respectively, at 90\% specificity, a new method detects 46\% of these proteins, resulting in more than 2-fold increase in sensitivity. {M}ore significantly, at the fold level, a new method can detect 14\% of remotely related proteins at 90\% specificity, a remarkable result considering the fact that the other methods can detect almost none at the same level of specificity.},
  doi      = {10.1093/bioinformatics/bti384},
  pdf      = {../local/Han2005Fold.pdf},
  file     = {Han2005Fold.pdf:local/Han2005Fold.pdf:PDF},
  keywords = {biosvm},
  pii      = {bti384},
  url      = {http://dx.doi.org/10.1093/bioinformatics/bti384}
}
@techreport{Haussler1999Convolution,
  author      = {Haussler, D.},
  title       = {Convolution {K}ernels on {D}iscrete {S}tructures},
  institution = {UC Santa Cruz},
  year        = {1999},
  number      = {UCSC-CRL-99-10},
  abstract    = {We introduce a new method of constructing kernels on sets whose elements are discrete structures like strings, trees and graphs. {T}he method can be applied iteratively to build a kernel on a infinite set from kernels involving generators of the set. {T}he family of kernels generated generalizes the family of radial basis kernels. {I}t can also be used to define kernels in the form of joint {G}ibbs probability distributions. {K}ernels can be built from hidden {M}arkov random fields, generalized regular expressions, pair-{HMM}s, or {ANOVA} decompositions. {U}ses of the method lead to open problems involving the theory of infinitely divisible positive definite functions. {F}undamentals of this theory and the theory of reproducing kernel {H}ilbert spaces are reviewed and applied in establishing the validity of the method.},
  pdf         = {../local/Haussler1999Convolution.pdf},
  file        = {Haussler1999Convolution.pdf:local/Haussler1999Convolution.pdf:PDF},
  keywords    = {biosvm},
  subject     = {kernel}
}
@article{Helma2004Data,
  author   = {Helma, C. and Cramer, T. and Kramer, S. and De Raedt, L.},
  title    = {Data mining and machine learning techniques for the identification of mutagenicity inducing substructures and structure activity relationships of noncongeneric compounds},
  journal  = {J. Chem. Inf. Comput. Sci.},
  year     = {2004},
  volume   = {44},
  pages    = {1402--1411},
  number   = {4},
  abstract = {This paper explores the utility of data mining and machine learning algorithms for the induction of mutagenicity structure-activity relationships ({SAR}s) from noncongeneric data sets. {W}e compare (i) a newly developed algorithm ({MOLFEA}) for the generation of descriptors (molecular fragments) for noncongeneric compounds with traditional {SAR} approaches (molecular properties) and (ii) different machine learning algorithms for the induction of {SAR}s from these descriptors. {I}n addition we investigate the optimal parameter settings for these programs and give an exemplary interpretation of the derived models. {T}he predictive accuracies of models using {MOLFEA} derived descriptors is approximately 10-15\%age points higher than those using molecular properties alone. {U}sing both types of descriptors together does not improve the derived models. {F}rom the applied machine learning techniques the rule learner {PART} and support vector machines gave the best results, although the differences between the learning algorithms are only marginal. {W}e were able to achieve predictive accuracies up to 78\% for 10-fold cross-validation. {T}he resulting models are relatively easy to interpret and usable for predictive as well as for explanatory purposes.},
  doi      = {10.1021/ci034254q},
  pdf      = {../local/Helma2004Data.pdf},
  file     = {Helma2004Data.pdf:local/Helma2004Data.pdf:PDF},
  keywords = {biosvm chemoinformatics},
  url      = {http://dx.doi.org/10.1021/ci034254q}
}
@comment{{NOTE(review): the linked PDF for Hochreiter2004Gene is named heterogeneous.pdf, which matches neither the entry key nor its title; confirm it is the right chapter.}}
@incollection{Hochreiter2004Gene,
  author    = {Hochreiter, S. and Obermayer, K.},
  title     = {Gene selection for microarray data},
  booktitle = {Kernel {M}ethods in {C}omputational {B}iology},
  publisher = {MIT Press},
  year      = {2004},
  editor    = {Sch{\"o}lkopf, B. and Tsuda, K. and Vert, J.P.},
  pages     = {319--355},
  pdf       = {http://cg.ensmp.fr/~vert/publi/04kmcbbook/heterogeneous.pdf},
  file      = {heterogeneous.pdf:http\://cg.ensmp.fr/~vert/publi/04kmcbbook/heterogeneous.pdf:PDF},
  keywords  = {biosvm},
  owner     = {vert}
}
@article{Hofmann2005Concept-based,
  author   = {Hofmann, Oliver and Schomburg, Dietmar},
  title    = {Concept-based annotation of enzyme classes.},
  journal  = {Bioinformatics},
  year     = {2005},
  volume   = {21},
  pages    = {2059--2066},
  number   = {9},
  month    = may,
  abstract = {M{OTIVATION}: {G}iven the explosive growth of biomedical data as well as the literature describing results and findings, it is getting increasingly difficult to keep up to date with new information. {K}eeping databases synchronized with current knowledge is a time-consuming and expensive task-one which can be alleviated by automatically gathering findings from the literature using linguistic approaches. {W}e describe a method to automatically annotate enzyme classes with disease-related information extracted from the biomedical literature for inclusion in such a database. {RESULTS}: {E}nzyme names for the 3901 enzyme classes in the {BRENDA} database, a repository for quantitative and qualitative enzyme information, were identified in more than 100,000 abstracts retrieved from the {P}ub{M}ed literature database. {P}hrases in the abstracts were assigned to concepts from the {U}nified {M}edical {L}anguage {S}ystem ({UMLS}) utilizing the {M}eta{M}ap program, allowing for the identification of disease-related concepts by their semantic fields in the {UMLS} ontology. {A}ssignments between enzyme classes and diseases were created based on their co-occurrence within a single sentence. {F}alse positives could be removed by a variety of filters including minimum number of co-occurrences, removal of sentences containing a negation and the classification of sentences based on their semantic fields by a {S}upport {V}ector {M}achine. {V}erification of the assignments with a manually annotated set of 1500 sentences yielded favorable results of 92\% precision at 50\% recall, sufficient for inclusion in a high-quality database. {AVAILABILITY}: {S}ource code is available from the author upon request. {SUPPLEMENTARY} {INFORMATION}: ftp.uni-koeln.de/institute/biochemie/pub/brenda/info/disease{S}upp.pdf.},
  doi      = {10.1093/bioinformatics/bti284},
  pdf      = {../local/Hofmann2005Concept-based.pdf},
  file     = {Hofmann2005Concept-based.pdf:local/Hofmann2005Concept-based.pdf:PDF},
  keywords = {biosvm},
  pii      = {bti284},
  url      = {http://dx.doi.org/10.1093/bioinformatics/bti284}
}
@article{Hou2004Remote,
  author   = {Hou, Y. and Hsu, W. and Lee, M. L. and Bystroff, C.},
  title    = {Remote homolog detection using local sequence-structure correlations.},
  journal  = {Proteins},
  year     = {2004},
  volume   = {57},
  pages    = {518-530},
  number   = {3},
  abstract = {Remote homology detection refers to the detection of structural homology in proteins when there is little or no sequence similarity. {I}n this article, we present a remote homolog detection method called {SVM}-{HMMSTR} that overcomes the reliance on detectable sequence similarity by transforming the sequences into strings of hidden {M}arkov states that represent local folding motif patterns. {T}hese state strings are transformed into fixed-dimension feature vectors for input to a support vector machine. {T}wo sets of features are defined: an order-independent feature set that captures the amino acid and local structure composition; and an order-dependent feature set that captures the sequential ordering of the local structures. {T}ests using the {S}tructural {C}lassification of {P}roteins ({SCOP}) 1.53 data set show that the {SVM}-{HMMSTR} gives a significant improvement over several current methods.},
  doi      = {10.1002/prot.20221},
  pdf      = {../local/Hou2004Remote.pdf},
  file     = {Hou2004Remote.pdf:local/Hou2004Remote.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert}
}
@article{Hou2003Efficient,
  author   = {Hou, Y. and Hsu, W. and Lee, M. L. and Bystroff, C.},
  title    = {Efficient remote homology detection using local structure},
  journal  = {Bioinformatics},
  year     = {2003},
  volume   = {19},
  pages    = {2294-2301},
  number   = {17},
  abstract = {Motivation: {T}he function of an unknown biological sequence can often be accurately inferred if we are able to map this unknown sequence to its corresponding homologous family. {A}t present, discriminative methods such as {SVM}-{F}isher and {SVM}-pairwise, which combine support vector machine ({SVM}) and sequence similarity, are recognized as the most accurate methods, with {SVM}-pairwise being the most accurate. {H}owever, these methods typically encode sequence information into their feature vectors and ignore the structure information. {T}hey are also computationally inefficient. {B}ased on these observations, we present an alternative method for {SVM}-based protein classification. {O}ur proposed method, {SVM}-{I}-sites, utilizes structure similarity for remote homology detection. {R}esult: {W}e run experiments on the {S}tructural {C}lassification of {P}roteins 1.53 data set. {T}he results show that {SVM}-{I}-sites is more efficient than {SVM}-pairwise. {F}urther, we find that {SVM}-{I}-sites outperforms sequence-based methods such as {PSI}-{BLAST}, {SAM}, and {SVM}-{F}isher while achieving a comparable performance with {SVM}-pairwise. {A}vailability: {I}-sites server is accessible through the web at http://www.bioinfo.rpi.edu. {P}rograms are available upon request for academics. {L}icensing agreements are available for commercial interests. {T}he framework of encoding local structure into feature vector is available upon request.},
  pdf      = {../local/Hou2003Efficient.pdf},
  file     = {Hou2003Efficient.pdf:local/Hou2003Efficient.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/17/2294}
}
@article{Hu2004Developing,
  author   = {Hu, C. and Li, X. and Liang, J.},
  title    = {Developing optimal non-linear scoring function for protein design},
  journal  = {Bioinformatics},
  year     = {2004},
  volume   = {20},
  pages    = {3080-3098},
  number   = {17},
  abstract = {Motivation. {P}rotein design aims to identify sequences compatible with a given protein fold but incompatible to any alternative folds. {T}o select the correct sequences and to guide the search process, a design scoring function is critically important. {S}uch a scoring function should be able to characterize the global fitness landscape of many proteins simultaneously. {R}esults: {T}o find optimal design scoring functions, we introduce two geometric views and propose a formulation using a mixture of non-linear {G}aussian kernel functions. {W}e aim to solve a simplified protein sequence design problem. {O}ur goal is to distinguish each native sequence for a major portion of representative protein structures from a large number of alternative decoy sequences, each a fragment from proteins of different folds. {O}ur scoring function discriminates perfectly a set of 440 native proteins from 14 million sequence decoys. {W}e show that no linear scoring function can succeed in this task. {I}n a blind test of unrelated proteins, our scoring function misclassifies only 13 native proteins out of 194. {T}his compares favorably with about three-four times more misclassifications when optimal linear functions reported in the literature are used. {W}e also discuss how to develop protein folding scoring function. {A}vailability: {A}vailable on request from the authors.},
  doi      = {10.1093/bioinformatics/bth369},
  pdf      = {../local/Hu2004Developing.pdf},
  file     = {Hu2004Developing.pdf:local/Hu2004Developing.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/20/17/3080}
}
@article{Hu2004Improved,
  author   = {Hu, H.J. and Pan, Y. and Harrison, R. and Tai, P.C.},
  title    = {Improved protein secondary structure prediction using support vector machine with a new encoding scheme and an advanced tertiary classifier},
  journal  = {I{EEE} {T}rans. {N}anobioscience},
  year     = {2004},
  volume   = {3},
  pages    = {265-271},
  number   = {4},
  abstract = {Prediction of protein secondary structures is an important problem in bioinformatics and has many applications. {T}he recent trend of secondary structure prediction studies is mostly based on the neural network or the support vector machine ({SVM}). {T}he {SVM} method is a comparatively new learning system which has mostly been used in pattern recognition problems. {I}n this study, {SVM} is used as a machine learning tool for the prediction of secondary structure and several encoding schemes, including orthogonal matrix, hydrophobicity matrix, {BLOSUM}62 substitution matrix, and combined matrix of these, are applied and optimized to improve the prediction accuracy. {A}lso, the optimal window length for six {SVM} binary classifiers is established by testing different window sizes and our new encoding scheme is tested based on this optimal window size via sevenfold cross validation tests. {T}he results show 2\% increase in the accuracy of the binary classifiers when compared with the instances in which the classical orthogonal matrix is used. {F}inally, to combine the results of the six {SVM} binary classifiers, a new tertiary classifier which combines the results of one-versus-one binary classifiers is introduced and the performance is compared with those of existing tertiary classifiers. {A}ccording to the results, the {Q}3 prediction accuracy of new tertiary classifier reaches 78.8\% and this is better than the best result reported in the literature.},
  pdf      = {../local/Hu2004Improved.pdf},
  file     = {Hu2004Improved.pdf:local/Hu2004Improved.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert}
}
@article{Hua2005Optimal,
  author   = {Hua, J. and Xiong, Z. and Lowey, J. and Suh, E. and Dougherty, E. R.},
  title    = {Optimal number of features as a function of sample size for various classification rules},
  journal  = {Bioinformatics},
  year     = {2005},
  volume   = {21},
  pages    = {1509-1515},
  number   = {8},
  month    = apr,
  abstract = {Motivation: {G}iven the joint feature-label distribution, increasing the number of features always results in decreased classification error; however, this is not the case when a classifier is designed via a classification rule from sample data. {T}ypically (but not always), for fixed sample size, the error of a designed classifier decreases and then increases as the number of features grows. {T}he potential downside of using too many features is most critical for small samples, which are commonplace for gene-expression-based classifiers for phenotype discrimination. {F}or fixed sample size and feature-label distribution, the issue is to find an optimal number of features. {R}esults: {S}ince only in rare cases is there a known distribution of the error as a function of the number of features and sample size, this study employs simulation for various feature-label distributions and classification rules, and across a wide range of sample and feature-set sizes. {T}o achieve the desired end, finding the optimal number of features as a function of sample size, it employs massively parallel computation. {S}even classifiers are treated: 3-nearest-neighbor, {G}aussian kernel, linear support vector machine, polynomial support vector machine, perceptron, regular histogram and linear discriminant analysis. {T}hree {G}aussian-based models are considered: linear, nonlinear and bimodal. {I}n addition, real patient data from a large breast-cancer study is considered. {T}o mitigate the combinatorial search for finding optimal feature sets, and to model the situation in which subsets of genes are co-regulated and correlation is internal to these subsets, we assume that the covariance matrix of the features is blocked, with each block corresponding to a group of correlated features. {A}ltogether there is a large number of error surfaces for the many cases. {T}hese are provided in full on a companion web-site, which is meant to serve as resource for those working with small-sample classification. {A}vailability: {F}or the companion web-site, please visit http://public.tgen.org/tamu/ofs/.},
  doi      = {10.1093/bioinformatics/bti171},
  pdf      = {../local/Hua2005Optimal.pdf},
  file     = {Hua2005Optimal.pdf:local/Hua2005Optimal.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/bti171v1}
}
@article{Hua2001Novel,
  author   = {Hua, S. and Sun, Z.},
  title    = {A {N}ovel {M}ethod of {P}rotein {S}econdary {S}tructure {P}rediction with {H}igh {S}egment {O}verlap {M}easure: {S}upport {V}ector {M}achine {A}pproach},
  journal  = {J. {M}ol. {B}iol.},
  year     = {2001},
  volume   = {308},
  pages    = {397--407},
  number   = {2},
  month    = apr,
  doi      = {10.1006/jmbi.2001.4580},
  pdf      = {../local/Hua2001Novel.pdf},
  file     = {Hua2001Novel.pdf:local/Hua2001Novel.pdf:PDF},
  keywords = {biosvm},
  subject  = {biokernel}
}
@article{Hua2001Support,
  author   = {Hua, S. and Sun, Z.},
  title    = {Support vector machine approach for protein subcellular localization prediction},
  journal  = {Bioinformatics},
  year     = {2001},
  volume   = {17},
  pages    = {721-728},
  number   = {8},
  abstract = {Motivation: {S}ubcellular localization is a key functional characteristic of proteins. {A} fully automatic and reliable prediction system for protein subcellular localization is needed, especially for the analysis of large-scale genome sequences. {R}esults: {I}n this paper, {S}upport {V}ector {M}achine has been introduced to predict the subcellular localization of proteins from their amino acid compositions. {T}he total prediction accuracies reach 91.4\% for three subcellular locations in prokaryotic organisms and 79.4\% for four locations in eukaryotic organisms. {P}redictions by our approach are robust to errors in the protein {N}-terminal sequences. {T}his new approach provides superior prediction performance compared with existing algorithms based on amino acid composition and can be a complementary method to other existing methods based on sorting signals. {A}vailability: {A} web server implementing the prediction method is available at http://www.bioinfo.tsinghua.edu.cn/{S}ub{L}oc/. {C}ontact: sunzhr@mail.tsinghua.edu.cn; huasj00@mails.tsinghua.edu.cn {S}upplementary information: {S}upplementary material is available at http://www.bioinfo.tsinghua.edu.cn/{S}ub{L}oc},
  pdf      = {../local/Hua2001Support.pdf},
  file     = {Hua2001Support.pdf:local/Hua2001Support.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/17/8/721}
}
@inproceedings{Huan2004Accurate,
  author    = {Huan, J. and Wang, W. and Washington, A. and Prins, J. and Shah, R. and Tropsha, A.},
  title     = {Accurate classification of protein structural families using coherent subgraph analysis.},
  booktitle = {Proceedings of the {P}acific {S}ymposium on {B}iocomputing 2004},
  year      = {2004},
  pages     = {411-422},
  abstract  = {Protein structural annotation and classification is an important problem in bioinformatics. {W}e report on the development of an efficient subgraph mining technique and its application to finding characteristic substructural patterns within protein structural families. {I}n our method, protein structures are represented by graphs where the nodes are residues and the edges connect residues found within certain distance from each other. {A}pplication of subgraph mining to proteins is challenging for a number reasons: (1) protein graphs are large and complex, (2) current protein databases are large and continue to grow rapidly, and (3) only a small fraction of the frequent subgraphs among the huge pool of all possible subgraphs could be significant in the context of protein classification. {T}o address these challenges, we have developed an information theoretic model called coherent subgraph mining. {F}rom information theory, the entropy of a random variable {X} measures the information content carried by {X} and the {M}utual {I}nformation ({MI}) between two random variables {X} and {Y} measures the correlation between {X} and {Y}. {W}e define a subgraph {X} as coherent if it is strongly correlated with every sufficiently large sub-subgraph {Y} embedded in it. {B}ased on the {MI} metric, we have designed a search scheme that only reports coherent subgraphs. {T}o determine the significance of coherent protein subgraphs, we have conducted an experimental study in which all coherent subgraphs were identified in several protein structural families annotated in the {SCOP} database ({M}urzin et al, 1995). {T}he {S}upport {V}ector {M}achine algorithm was used to classify proteins from different families under the binary classification scheme. {W}e find that this approach identifies spatial motifs unique to individual {SCOP} families and affords excellent discrimination between families.},
  pdf       = {../local/Huan2004Accurate.pdf},
  file      = {Huan2004Accurate.pdf:local/Huan2004Accurate.pdf:PDF},
  keywords  = {biosvm},
  owner     = {jeanphilippevert}
}
@article{Huang2005Support,
  author   = {Jing Huang and Feng Shi},
  title    = {Support vector machines for predicting apoptosis proteins types.},
  journal  = {Acta {B}iotheor.},
  year     = {2005},
  volume   = {53},
  pages    = {39-47},
  number   = {1},
  abstract = {Apoptosis proteins have a central role in the development and homeostasis of an organism. {T}hese proteins are very important for understanding the mechanism of programmed cell death, and their function is related to their types. {A}ccording to the classification scheme by {Z}hou and {D}octor (2003), the apoptosis proteins are categorized into the following four types: (1) cytoplasmic protein; (2) plasma membrane-bound protein; (3) mitochondrial inner and outer proteins; (4) other proteins. {A} powerful learning machine, the {S}upport {V}ector {M}achine, is applied for predicting the type of a given apoptosis protein by incorporating the sqrt-amino acid composition effect. {H}igh success rates were obtained by the re-substitute test (98/98 = 100 \%) and the jackknife test (89/98 = 90.8\%).},
  doi      = {10.1007/s10441-005-7002-5},
  pdf      = {../local/Huang2005Support.pdf},
  file     = {Huang2005Support.pdf:local/Huang2005Support.pdf:PDF},
  keywords = {biosvm},
  url      = {http://dx.doi.org/10.1007/s10441-005-7002-5}
}
@article{Huang2005CTKPred,
  author   = {Huang, N. and Chen, H. and Sun, Z.},
  title    = {C{TKP}red: an {SVM}-based method for the prediction and classification of the cytokine superfamily.},
  journal  = {Protein {E}ng. {D}es. {S}el.},
  year     = {2005},
  month    = jun,
  abstract = {Cell proliferation, differentiation and death are controlled by a multitude of cell-cell signals and loss of this control has devastating consequences. {P}rominent among these regulatory signals is the cytokine superfamily, which has crucial functions in the development, differentiation and regulation of immune cells. {I}n this study, a support vector machine ({SVM})-based method was developed for predicting families and subfamilies of cytokines using dipeptide composition. {T}he taxonomy of the cytokine superfamily with which our method complies was described in the {C}ytokine {F}amily c{DNA} {D}atabase (db{CFC}) and the dataset used in this study for training and testing was obtained from the db{CFC} and {S}tructural {C}lassification of {P}roteins ({SCOP}). {T}he method classified cytokines and non-cytokines with an accuracy of 92.5\% by 7-fold cross-validation. {T}he method is further able to predict seven major classes of cytokine with an overall accuracy of 94.7\%. {A} server for recognition and classification of cytokines based on multi-class {SVM}s has been set up at http://bioinfo.tsinghua.edu.cn/~huangni/{CTKP}red/.},
  doi      = {10.1093/protein/gzi041},
  pdf      = {../local/Huang2005CTKPred.pdf},
  file     = {Huang2005CTKPred.pdf:local/Huang2005CTKPred.pdf:PDF},
  keywords = {biosvm},
  pii      = {gzi041},
  url      = {http://dx.doi.org/10.1093/protein/gzi041}
}
@article{Huang2005Computation,
  author = {Huang, Shao-Wei and Hwang, Jenn-Kang},
  title = {Computation of conformational entropy from protein sequences using the machine-learning method--application to the study of the relationship between structural conservation and local structural stability.},
  journal = {Proteins},
  year = {2005},
  volume = {59},
  pages = {802--809},
  number = {4},
  month = jun,
  abstract = {A complete protein sequence can usually determine a unique conformation; however, the situation is different for shorter subsequences--some of them are able to adopt unique conformations, independent of context; while others assume diverse conformations in different contexts. {T}he conformations of subsequences are determined by the interplay between local and nonlocal interactions. {A} quantitative measure of such structural conservation or variability will be useful in the understanding of the sequence-structure relationship. {I}n this report, we developed an approach using the support vector machine method to compute the conformational variability directly from sequences, which is referred to as the sequence structural entropy. {A}s a practical application, we studied the relationship between sequence structural entropy and the hydrogen exchange for a set of well-studied proteins. {W}e found that the slowest exchange cores usually comprise amino acids of the lowest sequence structural entropy. {O}ur results indicate that structural conservation is closely related to the local structural stability. {T}his relationship may have interesting implications in the protein folding processes, and may be useful in the study of the sequence-structure relationship.},
  doi = {10.1002/prot.20462},
  pdf = {../local/Huang2005Computation.pdf},
  file = {Huang2005Computation.pdf:local/Huang2005Computation.pdf:PDF},
  keywords = {biosvm},
  url = {http://dx.doi.org/10.1002/prot.20462}
}
@article{Huang2005Gene,
  author = {Huang, T. M. and Kecman, V.},
  title = {Gene extraction for cancer diagnosis by support vector machines-{A}n improvement.},
  journal = {Artif. {I}ntell. {M}ed.},
  year = {2005},
  month = jul,
  abstract = {O{BJECTIVE}:: {T}o improve the performance of gene extraction for cancer diagnosis by recursive feature elimination with support vector machines ({RFE}-{SVM}s): {A} cancer diagnosis by using the {DNA} microarray data faces many challenges the most serious one being the presence of thousands of genes and only several dozens (at the best) of patient's samples. {T}hus, making any kind of classification in high-dimensional spaces from a limited number of data is both an extremely difficult and a prone to an error procedure. {T}he improved {RFE}-{SVM}s is introduced and used here for an elimination of less relevant genes and just for a reduction of the overall number of genes used in a medical diagnostic. {METHODS}:: {T}he paper shows why and how the, usually neglected, penalty parameter {C} and some standard data preprocessing techniques (normalizing and scaling) influence classification results and the gene selection of {RFE}-{SVM}s. {T}he gene selected by {RFE}-{SVM}s is compared with eight other gene selection algorithms implemented in the {R}ankgene software to investigate whether there is any consensus among the algorithms, so the scope of finding the right set of genes can be reduced. {RESULTS}:: {T}he improved {RFE}-{SVM}s is applied on the two benchmarking colon and lymphoma cancer data sets with various {C} parameters and different standard preprocessing techniques. {H}ere, decreasing {C} leads to the smaller diagnosis error in comparisons to other known methods applied to the benchmarking data sets. {W}ith an appropriate parameter {C} and with a proper preprocessing procedure, the reduction in a diagnosis error is as high as 36\%. {CONCLUSIONS}:: {T}he results suggest that with a properly chosen parameter {C}, the extracted genes and the constructed classifier will ensure less overfitting of the training data leading to an increased accuracy in selecting relevant genes. {F}inally, comparison in gene ranking obtained by different algorithms shows that there is a significant consensus among the various algorithms as to which set of genes is relevant.},
  doi = {10.1016/j.artmed.2005.01.006},
  pdf = {../local/Huang2005Gene.pdf},
  file = {Huang2005Gene.pdf:local/Huang2005Gene.pdf:PDF},
  keywords = {biosvm},
  pii = {S0933-3657(05)00051-5},
  url = {http://dx.doi.org/10.1016/j.artmed.2005.01.006}
}
@article{Hutter2004Prediction,
  author = {Hutter, B. and Schaab, C. and Albrecht, S. and Borgmann, M. and Brunner, N. A. and Freiberg, C. and Ziegelbauer, K. and Rock, C. O. and Ivanov, I. and Loferer, H.},
  title = {Prediction of {M}echanisms of {A}ction of {A}ntibacterial {C}ompounds by {G}ene {E}xpression {P}rofiling},
  journal = {Antimicrob. {A}gents {C}hemother.},
  year = {2004},
  volume = {48},
  pages = {2838--2844},
  number = {8},
  month = aug,
  abstract = {We have generated a database of expression profiles carrying the transcriptional responses of the model organism {B}acillus subtilis following treatment with 37 well-characterized antibacterial compounds of different classes. {T}he database was used to build a predictor for the assignment of the mechanisms of action ({M}o{A}s) of antibacterial compounds by the use of support vector machines. {T}his predictor was able to correctly classify the {M}o{A} class for most compounds tested. {F}urthermore, we provide evidence that the in vivo {M}o{A} of hexachlorophene does not match the {M}o{A} predicted from in vitro data, a situation frequently faced in drug discovery. {A} database of this kind may facilitate the prioritization of novel antibacterial entities in drug discovery programs. {P}otential applications and limitations are discussed.},
  doi = {10.1128/AAC.48.8.2838-2844.2004},
  eprint = {http://aac.asm.org/cgi/reprint/48/8/2838.pdf},
  pdf = {../local/Hutter2004Prediction.pdf},
  file = {Hutter2004Prediction.pdf:local/Hutter2004Prediction.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1128/AAC.48.8.2838-2844.2004}
}
@article{Jaakkola2000Discriminative,
  author   = {Jaakkola, T. and Diekhans, M. and Haussler, D.},
  title    = {A {D}iscriminative {F}ramework for {D}etecting {R}emote {P}rotein {H}omologies},
  journal  = {J. {C}omput. {B}iol.},
  volume   = {7},
  number   = {1,2},
  pages    = {95--114},
  year     = {2000},
  url      = {http://www.cse.ucsc.edu/research/compbio/discriminative/Jaakola2-1998.ps},
  pdf      = {../local/jaak00.pdf},
  file     = {jaak00.pdf:local/jaak00.pdf:PDF},
  keywords = {biosvm},
  subject  = {biokernelcasp}
}
@inproceedings{Jaakkola1999Using,
  author    = {Jaakkola, T. S. and Diekhans, M. and Haussler, D.},
  title     = {Using the {F}isher {K}ernel {M}ethod to {D}etect {R}emote {P}rotein {H}omologies},
  booktitle = {Proceedings of the {S}eventh {I}nternational {C}onference on {I}ntelligent {S}ystems for {M}olecular {B}iology},
  publisher = {AAAI Press},
  pages     = {149--158},
  year      = {1999},
  keywords  = {biosvm},
  owner     = {jeanphilippevert}
}
@inproceedings{Jaakkola1999Exploiting,
  author    = {Jaakkola, T. S. and Haussler, D.},
  title     = {Exploiting generative models in discriminative classifiers},
  booktitle = {Proc. of {T}enth {C}onference on {A}dvances in {N}eural {I}nformation {P}rocessing {S}ystems},
  year      = {1999},
  url       = {http://www.cse.ucsc.edu/research/ml/papers/Jaakola.ps},
  pdf       = {../local/jaak99.pdf},
  file      = {jaak99.pdf:local/jaak99.pdf:PDF},
  keywords  = {biosvm},
  subject   = {kernel}
}
@article{Jarzab2005Gene,
  author = {Jarzab, Barbara and Wiench, Malgorzata and Fujarewicz, Krzysztof and Simek, Krzysztof and Jarzab, Michal and Oczko-Wojciechowska, Malgorzata and Wloch, Jan and Czarniecka, Agnieszka and Chmielik, Ewa and Lange, Dariusz and Pawlaczek, Agnieszka and Szpak, Sylwia and Gubala, Elzbieta and Swierniak, Andrzej},
  title = {Gene expression profile of papillary thyroid cancer: sources of variability and diagnostic implications.},
  journal = {Cancer {R}es.},
  year = {2005},
  volume = {65},
  pages = {1587--1597},
  number = {4},
  month = feb,
  abstract = {The study looked for an optimal set of genes differentiating between papillary thyroid cancer ({PTC}) and normal thyroid tissue and assessed the sources of variability in gene expression profiles. {T}he analysis was done by oligonucleotide microarrays ({G}ene{C}hip {HG}-{U}133{A}) in 50 tissue samples taken intraoperatively from 33 patients (23 {PTC} patients and 10 patients with other thyroid disease). {I}n the initial group of 16 {PTC} and 16 normal samples, we assessed the sources of variability in the gene expression profile by singular value decomposition which specified three major patterns of variability. {T}he first and the most distinct mode grouped transcripts differentiating between tumor and normal tissues. {T}wo consecutive modes contained a large proportion of immunity-related genes. {T}o generate a multigene classifier for tumor-normal difference, we used support vector machines-based technique (recursive feature replacement). {I}t included the following 19 genes: {DPP}4, {GJB}3, {ST}14, {SERPINA}1, {LRP}4, {MET}, {EVA}1, {SPUVE}, {LGALS}3, {HBB}, {MKRN}2, {MRC}2, {IGSF}1, {KIAA}0830, {RXRG}, {P}4{HA}2, {CDH}3, {IL}13{RA}1, and {MTMR}4, and correctly discriminated 17 of 18 additional {PTC}/normal thyroid samples and all 16 samples published in a previous microarray study. {S}elected novel genes ({LRP}4, {EVA}1, {TMPRSS}4, {QPCT}, and {SLC}34{A}2) were confirmed by {Q}-{PCR}. {O}ur results prove that the gene expression signal of {PTC} is easily detectable even when cancer cells do not prevail over tumor stroma. {W}e indicate and separate the confounding variability related to the immune response. {F}inally, we propose a potent molecular classifier able to discriminate between {PTC} and nonmalignant thyroid in more than 90\% of investigated samples.},
  doi = {10.1158/0008-5472.CAN-04-3078},
  pdf = {../local/Jarzab2005Gene.pdf},
  file = {Jarzab2005Gene.pdf:local/Jarzab2005Gene.pdf:PDF},
  keywords = {biosvm},
  pii = {65/4/1587},
  url = {http://dx.doi.org/10.1158/0008-5472.CAN-04-3078}
}
@article{Jiang-Ning2004Cooperativity,
  author = {Jiang-Ning, S. and Wei-Jiang, L. and Wen-Bo, X.},
  title = {Cooperativity of the oxidization of cysteines in globular proteins.},
  journal = {J. {T}heor. {B}iol.},
  year = {2004},
  volume = {231},
  pages = {85--95},
  number = {1},
  abstract = {Based on the 639 non-homologous proteins with 2910 cysteine-containing segments of well-resolved three-dimensional structures, a novel approach has been proposed to predict the disulfide-bonding state of cysteines in proteins by constructing a two-stage classifier combining a first global linear discriminator based on their amino acid composition and a second local support vector machine classifier. {T}he overall prediction accuracy of this hybrid classifier for the disulfide-bonding state of cysteines in proteins has scored 84.1\% and 80.1\%, when measured on cysteine and protein basis using the rigorous jack-knife procedure, respectively. {I}t shows that whether cysteines should form disulfide bonds depends not only on the global structural features of proteins but also on the local sequence environment of proteins. {T}he result demonstrates the applicability of this novel method and provides comparable prediction performance compared with existing methods for the prediction of the oxidation states of cysteines in proteins.},
  doi = {10.1016/j.jtbi.2004.06.002},
  pdf = {../local/Jiang-Ning2004Cooperativity.pdf},
  file = {Jiang-Ning2004Cooperativity.pdf:local/Jiang-Ning2004Cooperativity.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1016/j.jtbi.2004.06.002}
}
@article{Jorissen2005Virtual,
  author = {Jorissen, R. N. and Gilson, M. K.},
  title = {Virtual screening of molecular databases using a support vector machine.},
  journal = {J {C}hem {I}nf {M}odel},
  year = {2005},
  volume = {45},
  pages = {549--561},
  number = {3},
  abstract = {The {S}upport {V}ector {M}achine ({SVM}) is an algorithm that derives a model used for the classification of data into two categories and which has good generalization properties. {T}his study applies the {SVM} algorithm to the problem of virtual screening for molecules with a desired activity. {I}n contrast to typical applications of the {SVM}, we emphasize not classification but enrichment of actives by using a modified version of the standard {SVM} function to rank molecules. {T}he method employs a simple and novel criterion for picking molecular descriptors and uses cross-validation to select {SVM} parameters. {T}he resulting method is more effective at enriching for active compounds with novel chemistries than binary fingerprint-based methods such as binary kernel discrimination.},
  doi = {10.1021/ci049641u},
  pdf = {../local/Jorissen2005Virtual.pdf},
  file = {Jorissen2005Virtual.pdf:local/Jorissen2005Virtual.pdf:PDF},
  keywords = {biosvm},
  url = {http://dx.doi.org/10.1021/ci049641u}
}
@article{Karchin2002Classifying,
  author = {Karchin, R. and Karplus, K. and Haussler, D.},
  title = {Classifying {G}-protein coupled receptors with support vector machines},
  journal = {Bioinformatics},
  year = {2002},
  volume = {18},
  number = {1},
  pages = {147--159},
  abstract = {Motivation: {T}he enormous amount of protein sequence data uncovered by genome research has increased the demand for computer software that can automate the recognition of new proteins. {W}e discuss the relative merits of various automated methods for recognizing {G}-{P}rotein {C}oupled {R}eceptors ({GPCR}s), a superfamily of cell membrane proteins. {GPCR}s are found in a wide range of organisms and are central to a cellular signalling network that regulates many basic physiological processes. {T}hey are the focus of a significant amount of current pharmaceutical research because they play a key role in many diseases. {H}owever, their tertiary structures remain largely unsolved. {T}he methods described in this paper use only primary sequence information to make their predictions. {W}e compare a simple nearest neighbor approach ({BLAST}), methods based on multiple alignments generated by a statistical profile {H}idden {M}arkov {M}odel ({HMM}), and methods, including {S}upport {V}ector {M}achines ({SVM}s), that transform protein sequences into fixed-length feature vectors. {R}esults: {T}he last is the most computationally expensive method, but our experiments show that, for those interested in annotation-quality classification, the results are worth the effort. {I}n two-fold cross-validation experiments testing recognition of {GPCR} subfamilies that bind a specific ligand (such as a histamine molecule), the errors per sequence at the {M}inimum {E}rror {P}oint ({MEP}) were 13.7\% for multi-class {SVM}s, 17.1\% for our {SVM}tree method of hierarchical multi-class {SVM} classification, 25.5\% for {BLAST}, 30\% for profile {HMM}s, and 49\% for classification based on nearest neighbor feature vector {K}ernel {N}earest {N}eighbor (kern{NN}). {T}he percentage of true positives recognized before the first false positive was 65\% for both {SVM} methods, 13\% for {BLAST}, 5\% for profile {HMM}s and 4\% for kern{NN}. {A}vailability: {W}e have set up a web server for {GPCR} subfamily classification based on hierarchical multi-class {SVM}s at http://www.soe.ucsc.edu/research/compbio/gpcr-subclass. {B}y scanning predicted peptides found in the human genome with the {SVM}tree server, we have identified a large number of genes that encode {GPCR}s. {A} list of our predictions for human {GPCR}s is available at http://www.soe.ucsc.edu/research/compbio/gpcr\_hg/class\_results. {W}e also provide suggested subfamily classification for 18 sequences previously identified as unclassified {C}lass {A} (rhodopsin-like) {GPCR}s in {GPCRDB} ({H}orn et al. , {N}ucleic {A}cids {R}es. , 26, 277--281, 1998), available at http://www.soe.ucsc.edu/research/compbio/gpcr/class{A}\_unclassified/},
  comment = {Un papier int{\'e}ressant sur l'utilisation du Fisher kernel pour classer les GPCR, une famille de prot{\'e}ines importante pour l'industrie pharmaceutique.},
  pdf = {../local/Karchin2002Classifying.pdf},
  file = {Karchin2002Classifying.pdf:local/Karchin2002Classifying.pdf:PDF},
  keywords = {fisher-kernel sequence-classification biosvm},
  subject = {biokernel},
  url = {http://bioinformatics.oupjournals.org/cgi/reprint/18/1/147}
}
@article{Karchin2005Improving,
  author = {Karchin, R. and Kelly, L. and Sali, A.},
  title = {Improving functional annotation of non-synonomous {SNP}s with information theory.},
  journal = {Pac. {S}ymp. {B}iocomput.},
  year = {2005},
  pages = {397--408},
  abstract = {Automated functional annotation of ns{SNP}s requires that amino-acid residue changes are represented by a set of descriptive features, such as evolutionary conservation, side-chain volume change, effect on ligand-binding, and residue structural rigidity. {I}dentifying the most informative combinations of features is critical to the success of a computational prediction method. {W}e rank 32 features according to their mutual information with functional effects of amino-acid substitutions, as measured by in vivo assays. {I}n addition, we use a greedy algorithm to identify a subset of highly informative features. {T}he method is simple to implement and provides a quantitative measure for selecting the best predictive features given a set of features that a human expert believes to be informative. {W}e demonstrate the usefulness of the selected highly informative features by cross-validated tests of a computational classifier, a support vector machine ({SVM}). {T}he {SVM}'s classification accuracy is highly correlated with the ranking of the input features by their mutual information. {T}wo features describing the solvent accessibility of "wild-type" and "mutant" amino-acid residues and one evolutionary feature based on superfamily-level multiple alignments produce comparable overall accuracy and 6\% fewer false positives than a 32-feature set that considers physiochemical properties of amino acids, protein electrostatics, amino-acid residue flexibility, and binding interactions.},
  keywords = {biosvm}
}
@article{Karklin2005Classification,
  author = {Karklin, Y. and Meraz, R. F. and Holbrook, S. R.},
  title = {Classification of non-coding {RNA} using graph representations of secondary structure.},
  journal = {Pac. {S}ymp. {B}iocomput.},
  year = {2005},
  pages = {4--15},
  abstract = {Some genes produce transcripts that function directly in regulatory, catalytic, or structural roles in the cell. {T}hese non-coding {RNA}s are prevalent in all living organisms, and methods that aid the understanding of their functional roles are essential. {RNA} secondary structure, the pattern of base-pairing, contains the critical information for determining the three dimensional structure and function of the molecule. {I}n this work we examine whether the basic geometric and topological properties of secondary structure are sufficient to distinguish between {RNA} families in a learning framework. {F}irst, we develop a labeled dual graph representation of {RNA} secondary structure by adding biologically meaningful labels to the dual graphs proposed by {G}an et al [1]. {N}ext, we define a similarity measure directly on the labeled dual graphs using the recently developed marginalized kernels [2]. {U}sing this similarity measure, we were able to train {S}upport {V}ector {M}achine classifiers to distinguish {RNA}s of known families from random {RNA}s with similar statistics. {F}or 22 of the 25 families tested, the classifier achieved better than 70\% accuracy, with much higher accuracy rates for some families. {T}raining a set of classifiers to automatically assign family labels to {RNA}s using a one vs. all multi-class scheme also yielded encouraging results. {F}rom these initial learning experiments, we suggest that the labeled dual graph representation, together with kernel machine methods, has potential for use in automated analysis and classification of uncharacterized {RNA} molecules or efficient genome-wide screens for {RNA} molecules from existing families.},
  keywords = {biosvm},
  url = {http://helix-web.stanford.edu/psb05/karklin.pdf}
}
@inproceedings{Kashima2003Marginalized,
  author = {Kashima, H. and Tsuda, K. and Inokuchi, A.},
  title = {Marginalized {K}ernels between {L}abeled {G}raphs},
  booktitle = {Proceedings of the {T}wentieth {I}nternational {C}onference on {M}achine {L}earning},
  year = {2003},
  editor = {Fawcett, T. and Mishra, N.},
  pages = {321--328},
  address = {New York, NY, USA},
  publisher = {AAAI Press},
  pdf = {../local/Kashima2003Marginalized.pdf},
  file = {Kashima2003Marginalized.pdf:local/Kashima2003Marginalized.pdf:PDF},
  keywords = {biosvm},
  owner = {vert}
}
@incollection{Kashima2004Kernels,
  author = {Kashima, H. and Tsuda, K. and Inokuchi, A.},
  title = {Kernels for graphs},
  booktitle = {Kernel {M}ethods in {C}omputational {B}iology},
  publisher = {MIT Press},
  year = {2004},
  editor = {Sch{\"o}lkopf, B. and Tsuda, K. and Vert, J. P.},
  pages = {155--170},
  address = {Cambridge, MA},
  keywords = {biosvm chemoinformatics},
  owner = {vert}
}
@article{Kim2004Predictiona,
  author = {Kim, H. and Park, H.},
  title = {Prediction of protein relative solvent accessibility with support vector machines and long-range interaction 3{D} local descriptor},
  journal = {Proteins},
  year = {2004},
  volume = {54},
  pages = {557--562},
  number = {3},
  month = feb,
  abstract = {The prediction of protein relative solvent accessibility gives us helpful information for the prediction of tertiary structure of a protein. {T}he {SVM}psi method, which uses support vector machines ({SVM}s), and the position-specific scoring matrix ({PSSM}) generated from {PSI}-{BLAST} have been applied to achieve better prediction accuracy of the relative solvent accessibility. {W}e have introduced a three-dimensional local descriptor that contains information about the expected remote contacts by both the long-range interaction matrix and neighbor sequences. {M}oreover, we applied feature weights to kernels in {SVM}s in order to consider the degree of significance that depends on the distance from the specific amino acid. {R}elative solvent accessibility based on a two state-model, for 25\%, 16\%, 5\%, and 0\% accessibility are predicted at 78.7\%, 80.7\%, 82.4\%, and 87.4\% accuracy, respectively. {T}hree-state prediction results provide a 64.5\% accuracy with 9\%; 36\% threshold. {T}he support vector machine approach has successfully been applied for solvent accessibility prediction by considering long-range interaction and handling unbalanced data.},
  doi = {10.1002/prot.10602},
  pdf = {../local/Kim2004Predictiona.pdf},
  file = {Kim2004Predictiona.pdf:local/Kim2004Predictiona.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1002/prot.10602}
}
@article{Kim2003Protein,
  author = {Kim, H. and Park, H.},
  title = {Protein secondary structure prediction based on an improved support vector machines approach},
  journal = {Protein {E}ng.},
  year = {2003},
  volume = {16},
  pages = {553--560},
  number = {8},
  month = aug,
  abstract = {The prediction of protein secondary structure is an important step in the prediction of protein tertiary structure. {A} new protein secondary structure prediction method, {SVM}psi, was developed to improve the current level of prediction by incorporating new tertiary classifiers and their jury decision system, and the {PSI}-{BLAST} {PSSM} profiles. {A}dditionally, efficient methods to handle unbalanced data and a new optimization strategy for maximizing the {Q}3 measure were developed. {T}he {SVM}psi produces the highest published {Q}3 and {SOV}94 scores on both the {RS}126 and {CB}513 data sets to date. {F}or a new {KP}480 set, the prediction accuracy of {SVM}psi was {Q}3 = 78.5\% and {SOV}94 = 82.8\%. {M}oreover, the blind test results for 136 non-redundant protein sequences which do not contain homologues of training data sets were {Q}3 = 77.2\% and {SOV}94 = 81.8\%. {T}he {SVM}psi results in {CASP}5 illustrate that it is another competitive method to predict protein secondary structure.},
  pdf = {../local/Kim2003Protein.pdf},
  file = {Kim2003Protein.pdf:local/Kim2003Protein.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://peds.oupjournals.org/cgi/content/abstract/16/8/553}
}
@article{Kim2004Prediction,
  author = {Kim, J. H. and Lee, J. and Oh, B. and Kimm, K. and Koh, I.},
  title = {Prediction of phosphorylation sites using {SVM}s},
  journal = {Bioinformatics},
  year = {2004},
  volume = {20},
  pages = {3179--3184},
  number = {17},
  abstract = {Motivation: {P}hosphorylation is involved in diverse signal transduction pathways. {B}y predicting phosphorylation sites and their kinases from primary protein sequences, we can obtain much valuable information that can form the basis for further research. {U}sing support vector machines, we attempted to predict phosphorylation sites and the type of kinase that acts at each site. {R}esults: {O}ur prediction system was limited to phosphorylation sites catalyzed by four protein kinase families and four protein kinase groups. {T}he accuracy of the predictions ranged from 83 to 95\% at the kinase family level, and 76-91\% at the kinase group level. {T}he prediction system used--{P}red{P}hospho--can be applied to the functional study of proteins, and can help predict the changes in phosphorylation sites caused by amino acid variations at intra- and interspecies levels. {A}vailability: {P}red{P}hospho is available at http://www.ngri.re.kr/proteo/{P}red{P}hospho.htm. {S}upplementary information: http://www.ngri.re.kr/proteo/supplementary.doc},
  doi = {10.1093/bioinformatics/bth382},
  pdf = {../local/Kim2004Prediction.pdf},
  file = {Kim2004Prediction.pdf:local/Kim2004Prediction.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/20/17/3179}
}
@inproceedings{Kin2002Marginalized,
  author = {Kin, T. and Tsuda, K. and Asai, K.},
  title = {Marginalized kernels for {RNA} sequence data analysis},
  booktitle = {Genome {I}nformatics 2002},
  year = {2002},
  editor = {Lathrop, R. H. and Nakai, K. and Miyano, S. and Takagi, T. and Kanehisa, M.},
  pages = {112--122},
  publisher = {Universal Academic Press},
  abstract = {We present novel kernels that measure similarity of two {RNA} sequences, taking account of their secondary structures. {T}wo types of kernels are presented. {O}ne is for {RNA} sequences with known secondary structures, the other for those without known secondary structures. {T}he latter employs stochastic context-free grammar ({SCFG}) for estimating the secondary structure. {W}e call the latter the {\it marginalized count kernel} ({MCK}). {W}e show computational experiments for {MCK} using 74 sets of human t{RNA} sequence data: (i) kernel principal component analysis ({PCA}) for visualizing t{RNA} similarities, (ii) supervised classification with support vector machines ({SVM}s). {B}oth types of experiment show promising results for {MCK}s.},
  pdf = {../local/Kin2002Marginalized.pdf},
  file = {Kin2002Marginalized.pdf:local/Kin2002Marginalized.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://www.jsbi.org/journal/GIW02/GIW02F012.html}
}
@article{Kohlmann2004Pediatric,
  author = {Kohlmann, A. and Schoch, C. and Schnittger, S. and Dugas, M. and Hiddemann, W. and Kern, W. and Haferlach, T.},
  title = {Pediatric acute lymphoblastic leukemia ({ALL}) gene expression signatures classify an independent cohort of adult {ALL} patients},
  journal = {Leukemia},
  year = {2004},
  volume = {18},
  pages = {63--71},
  number = {1},
  abstract = {Recent reports support a possible future application of gene expression profiling for the diagnosis of leukemias. {H}owever, the robustness of subtype-specific gene expression signatures has to be proven on independent patient samples. {H}ere, we present gene expression data of 34 adult acute lymphoblastic leukemia ({ALL}) patients ({A}ffymetrix {U}133{A} microarrays). {S}upport {V}ector {M}achines ({SVM}s) were applied to stratify our samples based on given gene lists reported to predict {MLL}, {BCR}-{ABL}, and {T}-{ALL}, as well as {MLL} and non-{MLL} gene rearrangement positive pediatric {ALL}. {I}n addition, seven other {B}-precursor {ALL} cases not bearing t(9;22) or t(11q23)/{MLL} chromosomal aberrations were analyzed. {U}sing top differentially expressed genes, hierarchical cluster and principal component analyses demonstrate that the genetically more heterogeneous {B}-precursor {ALL} samples intercalate with {BCR}-{ABL}-positive cases, but were clearly distinct from {T}-{ALL} and {MLL} profiles. {S}imilar expression signatures were observed for both heterogeneous {B}-precursor {ALL} and for {BCR}-{ABL}-positive cases. {A}s an unrelated laboratory, we demonstrate that gene signatures defined for childhood {ALL} were also capable of stratifying distinct subtypes in our cohort of adult {ALL} patients. {A}s such, previously reported gene expression patterns identified by microarray technology are validated and confirmed on truly independent leukemia patient samples.},
  doi = {10.1038/sj.leu.2403167},
  pdf = {../local/Kohlmann2004Pediatric.pdf},
  file = {Kohlmann2004Pediatric.pdf:local/Kohlmann2004Pediatric.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1038/sj.leu.2403167}
}
@article{Koike2004Prediction,
  author = {Koike, A. and Takagi, T.},
  title = {Prediction of protein-protein interaction sites using support vector machines},
  journal = {Protein {E}ng. {D}es. {S}el.},
  year = {2004},
  volume = {17},
  pages = {165--173},
  number = {2},
  month = feb,
  abstract = {The identification of protein-protein interaction sites is essential for the mutant design and prediction of protein-protein networks. {T}he interaction sites of residue units were predicted using support vector machines ({SVM}) and the profiles of sequentially/spatially neighboring residues, plus additional information. {W}hen only sequence information was used, prediction performance was highest using the feature vectors, sequentially neighboring profiles and predicted interaction site ratios, which were calculated by {SVM} regression using amino acid compositions. {W}hen structural information was also used, prediction performance was highest using the feature vectors, spatially neighboring residue profiles, accessible surface areas, and the with/without protein interaction sites ratios predicted by {SVM} regression and amino acid compositions. {I}n the latter case, the precision at recall = 50\% was 54-56\% for a homo-hetero mixed test set and >20\% higher than for random prediction. {A}pproximately 30\% of the residues wrongly predicted as interaction sites were the closest sequentially/spatially neighboring on the interaction site residues. {T}he predicted residues covered 86-87\% of the actual interfaces (96-97\% of interfaces with over 20 residues). {T}his prediction performance appeared to be slightly higher than a previously reported study. {C}omparing the prediction accuracy of each molecule, it seems to be easier to predict interaction sites for stable complexes.},
  doi = {10.1093/protein/gzh020},
  pdf = {../local/Koike2004Prediction.pdf},
  file = {Koike2004Prediction.pdf:local/Koike2004Prediction.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1093/protein/gzh020}
}
@article{Komura2005Multidimensional,
  author    = {Komura, D. and Nakamura, H. and Tsutsumi, S. and Aburatani, H. and Ihara, S.},
  title     = {Multidimensional support vector machines for visualization of gene expression data},
  journal   = {Bioinformatics},
  year      = {2005},
  volume    = {21},
  number    = {4},
  pages     = {439--444},
  month     = feb,
  abstract  = {Motivation: {S}ince {DNA} microarray experiments provide us with huge amount of gene expression data, they should be analyzed with statistical methods to extract the meanings of experimental results. {S}ome dimensionality reduction methods such as {P}rincipal {C}omponent {A}nalysis ({PCA}) are used to roughly visualize the distribution of high dimensional gene expression data. {H}owever, in the case of binary classification of gene expression data, {PCA} does not utilize class information when choosing axes. {T}hus clearly separable data in the original space may not be so in the reduced space used in {PCA}. {R}esults: {F}or visualization and class prediction of gene expression data, we have developed a new {SVM}-based method called multidimensional {SVM}s, that generate multiple orthogonal axes. {T}his method projects high dimensional data into lower dimensional space to exhibit properties of the data clearly and to visualize a distribution of the data roughly. {F}urthermore, the multiple axes can be used for class prediction. {T}he basic properties of conventional {SVM}s are retained in our method: solutions of mathematical programming are sparse, and nonlinear classification is implemented implicitly through the use of kernel functions. {T}he application of our method to the experimentally obtained gene expression datasets for patients' samples indicates that our algorithm is efficient and useful for visualization and class prediction.},
  doi       = {10.1093/bioinformatics/bti188},
  pdf       = {../local/Komura2005Multidimensional.pdf},
  file      = {Komura2005Multidimensional.pdf:local/Komura2005Multidimensional.pdf:PDF},
  keywords  = {biosvm},
  owner     = {jeanphilippevert},
  url       = {http://bioinformatics.oupjournals.org/cgi/content/abstract/bti188v1}
}
@incollection{Kondor2004Diffusion,
  author    = {Kondor, R. and Vert, J.-P.},
  title     = {Diffusion kernels},
  booktitle = {Kernel {M}ethods in {C}omputational {B}iology},
  publisher = {MIT Press},
  year      = {2004},
  editor    = {Sch{\"o}lkopf, B. and Tsuda, K. and Vert, J.-P.},
  pages     = {171--192},
  pdf       = {../local/saigo.pdf:http\://cg.ensmp.fr/~vert/publi/04kmcbbook/saigo.pdf:PDF},
  file      = {saigo.pdf:http\://cg.ensmp.fr/~vert/publi/04kmcbbook/saigo.pdf:PDF;saigo.pdf:http\://cg.ensmp.fr/~vert/publi/04kmcbbook/saigo.pdf:PDF},
  keywords  = {biosvm},
  owner     = {vert}
}
@inproceedings{Kondor2002Diffusion,
  author    = {Kondor, R. I. and Lafferty, J.},
  title     = {Diffusion kernels on graphs and other discrete input spaces},
  booktitle = {Proceedings of the Nineteenth International Conference on Machine Learning},
  year      = {2002},
  pages     = {315--322},
  address   = {San Francisco, CA, USA},
  publisher = {Morgan Kaufmann Publishers Inc.},
  pdf       = {../local/Kondor2002Diffusion.pdf},
  file      = {Kondor2002Diffusion.pdf:Kondor2002Diffusion.pdf:PDF},
  keywords  = {biosvm},
  subject   = {kernelnet}
}
@article{Kote-Jarai2004Gene,
  author    = {Zsofia Kote-Jarai and Richard D Williams and Nicola Cattini and Maria Copeland and Ian Giddings and Richard Wooster and Robert H tePoele and Paul Workman and Barry Gusterson and John Peacock and Gerald Gui and Colin Campbell and Ros Eeles},
  title     = {Gene expression profiling after radiation-induced {DNA} damage is strongly predictive of {BRCA}1 mutation carrier status.},
  journal   = {Clin. {C}ancer {R}es.},
  year      = {2004},
  volume    = {10},
  number    = {3},
  pages     = {958--63},
  month     = feb,
  abstract  = {P{URPOSE}: {T}he impact of the presence of a germ-line {BRCA}1 mutation on gene expression in normal breast fibroblasts after radiation-induced {DNA} damage has been investigated. {EXPERIMENTAL} {DESIGN}: {H}igh-density c{DNA} microarray technology was used to identify differential responses to {DNA} damage in fibroblasts from nine heterozygous {BRCA}1 mutation carriers compared with five control samples without personal or family history of any cancer. {F}ibroblast cultures were irradiated, and their expression profile was compared using intensity ratios of the c{DNA} microarrays representing 5603 {IMAGE} clones. {RESULTS}: {C}lass comparison and class prediction analysis has shown that {BRCA}1 mutation carriers can be distinguished from controls with high probability (approximately 85\%). {S}ignificance analysis of microarrays and the support vector machine classifier identified gene sets that discriminate the samples according to their mutation status. {T}hese include genes already known to interact with {BRCA}1 such as {CDKN}1{B}, {ATR}, and {RAD}51. {CONCLUSIONS}: {T}he results of this initial study suggest that normal cells from heterozygous {BRCA}1 mutation carriers display a different gene expression profile from controls in response to {DNA} damage. {A}daptations of this pilot result to other cell types could result in the development of a functional assay for {BRCA}1 mutation status.},
  pdf       = {../local/Kote-Jarai2004Gene.pdf},
  file      = {Kote-Jarai2004Gene.pdf:local/Kote-Jarai2004Gene.pdf:PDF},
  keywords  = {biosvm, breastcancer},
  url       = {http://clincancerres.aacrjournals.org/cgi/content/abstract/10/3/958}
}
@article{Kramer2002Fragment,
  author    = {Kramer, S. and Frank, E. and Helma, C.},
  title     = {Fragment generation and support vector machines for inducing {SAR}s.},
  journal   = {S{AR} {QSAR} {E}nviron {R}es},
  year      = {2002},
  volume    = {13},
  number    = {5},
  pages     = {509--23},
  month     = jul,
  abstract  = {We present a new approach to the induction of {SAR}s based on the generation of structural fragments and support vector machines ({SVM}s). {I}t is tailored for bio-chemical databases, where the examples are two-dimensional descriptions of chemical compounds. {T}he fragment generator finds all fragments (i.e. linearly connected atoms) that satisfy user-specified constraints regarding their frequency and generality. {I}n this paper, we are querying for fragments within a minimum and a maximum frequency in the dataset. {A}fter fragment generation, we propose to apply {SVM}s to the problem of inducing {SAR}s from these fragments. {W}e conjecture that the {SVM}s are particularly useful in this context, as they can deal with a large number of features. {E}xperiments in the domains of carcinogenicity and mutagenicity prediction show that the minimum and the maximum frequency queries for fragments can be answered within a reasonable time, and that the predictive accuracy obtained using these fragments is satisfactory. {H}owever, further experiments will have to confirm that this is a viable approach to inducing {SAR}s.},
  doi       = {10.1080/10629360290023340},
  keywords  = {biosvm},
  url       = {http://dx.doi.org/10.1080/10629360290023340}
}
@article{Krishnan2003comparative,
  author    = {Krishnan, V. G. and Westhead, D. R.},
  title     = {A comparative study of machine-learning methods to predict the effects of single nucleotide polymorphisms on protein function},
  journal   = {Bioinformatics},
  year      = {2003},
  volume    = {19},
  number    = {17},
  pages     = {2199--2209},
  abstract  = {Motivation: {T}he large volume of single nucleotide polymorphism data now available motivates the development of methods for distinguishing neutral changes from those which have real biological effects. {H}ere, two different machine-learning methods, decision trees and support vector machines ({SVM}s), are applied for the first time to this problem. {I}n common with most other methods, only non-synonymous changes in protein coding regions of the genome are considered. {R}esults: {I}n detailed cross-validation analysis, both learning methods are shown to compete well with existing methods, and to out-perform them in some key tests. {SVM}s show better generalization performance, but decision trees have the advantage of generating interpretable rules with robust estimates of prediction confidence. {I}t is shown that the inclusion of protein structure information produces more accurate methods, in agreement with other recent studies, and the effect of using predicted rather than actual structure is evaluated. {A}vailability: {S}oftware is available on request from the authors.},
  pdf       = {../local/Krishnan2003comparative.pdf},
  file      = {Krishnan2003comparative.pdf:local/Krishnan2003comparative.pdf:PDF},
  keywords  = {biosvm},
  owner     = {jeanphilippevert},
  url       = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/17/2199}
}
@incollection{Krishnapuram2004Gene,
  author    = {Krishnapuram, B. and Carin, L. and Hartemink, A.},
  title     = {Gene expression analysis: joint feature selection and classifier design},
  booktitle = {Kernel {M}ethods in {C}omputational {B}iology},
  publisher = {MIT Press},
  year      = {2004},
  editor    = {Sch{\"o}lkopf, B. and Tsuda, K. and Vert, J.-P.},
  pages     = {299--317},
  pdf       = {../local/heterogeneous.pdf:http\://cg.ensmp.fr/~vert/publi/04kmcbbook/heterogeneous.pdf:PDF},
  file      = {heterogeneous.pdf:http\://cg.ensmp.fr/~vert/publi/04kmcbbook/heterogeneous.pdf:PDF;heterogeneous.pdf:http\://cg.ensmp.fr/~vert/publi/04kmcbbook/heterogeneous.pdf:PDF},
  keywords  = {biosvm},
  owner     = {vert}
}
@article{Krishnapuram2004Joint,
  author    = {Krishnapuram, B. and Carin, L. and Hartemink, A.},
  title     = {Joint {C}lassifier and {F}eature {O}ptimization for {C}omprehensive {C}ancer {D}iagnosis {U}sing {G}ene {E}xpression {D}ata},
  journal   = {J. {C}omput. {B}iol.},
  year      = {2004},
  volume    = {11},
  number    = {2-3},
  pages     = {227--242},
  abstract  = {{R}ecent research has demonstrated quite convincingly that accurate cancer diagnosis can be achieved by constructing classifiers that are designed to compare the gene expression profile of a tissue of unknown cancer status to a database of stored expression profiles from tissues of known cancer status. {T}his paper introduces the {JCFO}, a novel algorithm that uses a sparse {B}ayesian approach to jointly identify both the optimal nonlinear classifier for diagnosis and the optimal set of genes on which to base that diagnosis. {W}e show that the diagnostic classification accuracy of the proposed algorithm is superior to a number of current state-of-the-art methods in a full leave-one-out cross-validation study of five widely used benchmark datasets. {I}n addition to its superior classification accuracy, the algorithm is designed to automatically identify a small subset of genes (typically around twenty in our experiments) that are capable of providing complete discriminatory information for diagnosis. {F}ocusing attention on a small subset of genes is useful not only because it produces a classifier with good generalization capacity, but also because this set of genes may provide insights into the mechanisms responsible for the disease itself. {A} number of the genes identified by the {JCFO} in our experiments are already in use as clinical markers for cancer diagnosis; some of the remaining genes may be excellent candidates for further clinical investigation. {I}f it is possible to identify a small set of genes that is indeed capable of providing complete discrimination, inexpensive diagnostic assays might be widely deployable in clinical settings.},
  doi       = {10.1089/1066527041410463},
  pdf       = {../local/Krishnapuram2004Joint.pdf},
  file      = {Krishnapuram2004Joint.pdf:local/Krishnapuram2004Joint.pdf:PDF},
  keywords  = {biosvm},
  owner     = {jeanphilippevert},
  url       = {http://dx.doi.org/10.1089/1066527041410463}
}
@article{Krishnapuram2004bayesian,
  author    = {Krishnapuram, B. and Hartemink, A. J. and Carin, L. and Figueiredo, M. A. T.},
  title     = {A {B}ayesian approach to joint feature selection and classifier design},
  journal   = {IEEE T. Pattern. Anal.},
  year      = {2004},
  volume    = {26},
  number    = {9},
  pages     = {1105--11},
  month     = sep,
  abstract  = {This paper adopts a {B}ayesian approach to simultaneously learn both an optimal nonlinear classifier and a subset of predictor variables (or features) that are most relevant to the classification task. {T}he approach uses heavy-tailed priors to promote sparsity in the utilization of both basis functions and features; these priors act as regularizers for the likelihood function that rewards good classification on the training data. {W}e derive an expectation-maximization ({EM}) algorithm to efficiently compute a maximum a posteriori ({MAP}) point estimate of the various parameters. {T}he algorithm is an extension of recent state-of-the-art sparse {B}ayesian classifiers, which in turn can be seen as {B}ayesian counterparts of support vector machines. {E}xperimental comparisons using kernel classifiers demonstrate both parsimonious feature selection and excellent classification accuracy on a range of synthetic and benchmark data sets.},
  doi       = {10.1109/TPAMI.2004.55},
  pdf       = {../local/Krishnapuram2004bayesian.pdf},
  file      = {Krishnapuram2004bayesian.pdf:local/Krishnapuram2004bayesian.pdf:PDF},
  keywords  = {biosvm},
  url       = {http://dx.doi.org/10.1109/TPAMI.2004.55}
}
@article{Kuang2005Profile-based,
  author    = {Kuang, R. and Ie, E. and Wang, K. and Wang, K. and Siddiqi, M. and Freund, Y. and Leslie, C.},
  title     = {Profile-based string kernels for remote homology detection and motif extraction.},
  journal   = {J. Bioinform. Comput. Biol.},
  year      = {2005},
  volume    = {3},
  number    = {3},
  pages     = {527--550},
  month     = jun,
  abstract  = {We introduce novel profile-based string kernels for use with support vector machines (SVMs) for the problems of protein classification and remote homology detection. These kernels use probabilistic profiles, such as those produced by the PSI-BLAST algorithm, to define position-dependent mutation neighborhoods along protein sequences for inexact matching of k-length subsequences ("k-mers") in the data. By use of an efficient data structure, the kernels are fast to compute once the profiles have been obtained. For example, the time needed to run PSI-BLAST in order to build the profiles is significantly longer than both the kernel computation time and the SVM training time. We present remote homology detection experiments based on the SCOP database where we show that profile-based string kernels used with SVM classifiers strongly outperform all recently presented supervised SVM methods. We further examine how to incorporate predicted secondary structure information into the profile kernel to obtain a small but significant performance improvement. We also show how we can use the learned SVM classifier to extract "discriminative sequence motifs"--short regions of the original profile that contribute almost all the weight of the SVM classification score--and show that these discriminative motifs correspond to meaningful structural features in the protein data. The use of PSI-BLAST profiles can be seen as a semi-supervised learning technique, since PSI-BLAST leverages unlabeled data from a large sequence database to build more informative profiles. Recently presented "cluster kernels" give general semi-supervised methods for improving SVM protein classification performance. We show that our profile kernel results also outperform cluster kernels while providing much better scalability to large datasets.},
  keywords  = {biosvm},
  owner     = {vert},
  pii       = {S021972000500120X},
  pmid      = {16108083},
  timestamp = {2007.08.01}
}
@article{Kuang2004Profile-based,
  author    = {Kuang, R. and Ie, E. and Wang, K. and Wang, K. and Siddiqi, M. and Freund, Y. and Leslie, C.},
  title     = {Profile-based string kernels for remote homology detection and motif extraction.},
  journal   = {Proc IEEE Comput Syst Bioinform Conf},
  year      = {2004},
  pages     = {152--160},
  abstract  = {We introduce novel profile-based string kernels for use with support vector machines (SVMs) for the problems of protein classification and remote homology detection. These kernels use probabilistic profiles, such as those produced by the PSI-BLAST algorithm, to define position-dependent mutation neighborhoods along protein sequences for inexact matching of k-length subsequences ("k-mers") in the data. By use of an efficient data structure, the kernels are fast to compute once the profiles have been obtained. For example, the time needed to run PSI-BLAST in order to build the profiles is significantly longer than both the kernel computation time and the SVM training time. We present remote homology detection experiments based on the SCOP database where we show that profile-based string kernels used with SVM classifiers strongly outperform all recently presented supervised SVM methods. We also show how we can use the learned SVM classifier to extract "discriminative sequence motifs" -- short regions of the original profile that contribute almost all the weight of the SVM classification score -- and show that these discriminative motifs correspond to meaningful structural features in the protein data. The use of PSI-BLAST profiles can be seen as a semi-supervised learning technique, since PSI-BLAST leverages unlabeled data from a large sequence database to build more informative profiles. Recently presented "cluster kernels" give general semi-supervised methods for improving SVM protein classification performance. We show that our profile kernel results are comparable to cluster kernels while providing much better scalability to large datasets.},
  keywords  = {biosvm},
  owner     = {vert},
  pmid      = {16448009},
  timestamp = {2007.08.01}
}
@article{Kuang2004Protein,
  author    = {Kuang, R. and Leslie, C. S. and Yang, A.-S.},
  title     = {Protein backbone angle prediction with machine learning approaches},
  journal   = {Bioinformatics},
  year      = {2004},
  volume    = {20},
  number    = {10},
  pages     = {1612--1621},
  abstract  = {Motivation: {P}rotein backbone torsion angle prediction provides useful local structural information that goes beyond conventional three-state ($\alpha$, $\beta$ and coil) secondary structure predictions. {A}ccurate prediction of protein backbone torsion angles will substantially improve modeling procedures for local structures of protein sequence segments, especially in modeling loop conformations that do not form regular structures as in $\alpha$-helices or $\beta$-strands. {R}esults: {W}e have devised two novel automated methods in protein backbone conformational state prediction: one method is based on support vector machines ({SVM}s); the other method combines a standard feed-forward back-propagation artificial neural network ({NN}) with a local structure-based sequence profile database ({LSBSP}1). {E}xtensive benchmark experiments demonstrate that both methods have improved the prediction accuracy rate over the previously published methods for conformation state prediction when using an alphabet of three or four states. {A}vailability: {LSBSP}1 and the {NN} algorithm have been implemented in {P}r{ISM}.1, which is available from www.columbia.edu/~ay1/. {S}upplementary information: {S}upplementary data for the {SVM} method can be downloaded from the {W}ebsite www.cs.columbia.edu/compbio/backbone.},
  pdf       = {../local/Kuang2004Protein.pdf},
  file      = {Kuang2004Protein.pdf:local/Kuang2004Protein.pdf:PDF},
  keywords  = {biosvm},
  owner     = {jeanphilippevert},
  url       = {http://bioinformatics.oupjournals.org/cgi/content/abstract/20/10/1612}
}
@article{Kumar2005BhairPred,
  author    = {Kumar, M. and Bhasin, M. and Natt, N. K. and Raghava, G. P. S.},
  title     = {Bhair{P}red: prediction of beta-hairpins in a protein from multiple alignment information using {ANN} and {SVM} techniques.},
  journal   = {Nucleic {A}cids {R}es},
  year      = {2005},
  volume    = {33},
  number    = {Web Server issue},
  pages     = {W154--9},
  month     = jul,
  abstract  = {This paper describes a method for predicting a supersecondary structural motif, beta-hairpins, in a protein sequence. {T}he method was trained and tested on a set of 5102 hairpins and 5131 non-hairpins, obtained from a non-redundant dataset of 2880 proteins using the {DSSP} and {PROMOTIF} programs. {T}wo machine-learning techniques, an artificial neural network ({ANN}) and a support vector machine ({SVM}), were used to predict beta-hairpins. {A}n accuracy of 65.5\% was achieved using {ANN} when an amino acid sequence was used as the input. {T}he accuracy improved from 65.5 to 69.1\% when evolutionary information ({PSI}-{BLAST} profile), observed secondary structure and surface accessibility were used as the inputs. {T}he accuracy of the method further improved from 69.1 to 79.2\% when the {SVM} was used for classification instead of the {ANN}. {T}he performances of the methods developed were assessed in a test case, where predicted secondary structure and surface accessibility were used instead of the observed structure. {T}he highest accuracy achieved by the {SVM} based method in the test case was 77.9\%. {A} maximum accuracy of 71.1\% with {M}atthew's correlation coefficient of 0.41 in the test case was obtained on a dataset previously used by {X}. {C}ruz, {E}. {G}. {H}utchinson, {A}. {S}hephard and {J}. {M}. {T}hornton (2002) {P}roc. {N}atl {A}cad. {S}ci. {USA}, 99, 11157-11162. {T}he performance of the method was also evaluated on proteins used in the '6th community-wide experiment on the critical assessment of techniques for protein structure prediction ({CASP}6)'. {B}ased on the algorithm described, a web server, {B}hair{P}red (http://www.imtech.res.in/raghava/bhairpred/), has been developed, which can be used to predict beta-hairpins in a protein using the {SVM} approach.},
  doi       = {10.1093/nar/gki588},
  pdf       = {../local/Kumar2005BhairPred.pdf},
  file      = {Kumar2005BhairPred.pdf:local/Kumar2005BhairPred.pdf:PDF},
  keywords  = {biosvm},
  url       = {http://dx.doi.org/10.1093/nar/gki588}
}
@incollection{Lanckriet2004Kernel-based,
  author    = {Lanckriet, G. R. G. and Cristianini, N. and Jordan, M. I. and Noble, W. S.},
  title     = {Kernel-based integration of genomic data using semidefinite programming},
  booktitle = {Kernel {M}ethods in {C}omputational {B}iology},
  publisher = {MIT Press},
  year      = {2004},
  editor    = {Sch{\"o}lkopf, B. and Tsuda, K. and Vert, J.-P.},
  pages     = {231--259},
  keywords  = {biosvm},
  owner     = {vert}
}
@inproceedings{Lanckriet2004Kernel-baseda,
  author    = {Lanckriet, G. R. and Deng, M. and Cristianini, N. and Jordan, M. I. and Noble, W. S.},
  title     = {Kernel-based data fusion and its application to protein function prediction in yeast.},
  booktitle = {Proceedings of the {P}acific {S}ymposium on {B}iocomputing},
  year      = {2004},
  pages     = {300--311},
  abstract  = {Kernel methods provide a principled framework in which to represent many types of data, including vectors, strings, trees and graphs. {A}s such, these methods are useful for drawing inferences about biological phenomena. {W}e describe a method for combining multiple kernel representations in an optimal fashion, by formulating the problem as a convex optimization problem that can be solved using semidefinite programming techniques. {T}he method is applied to the problem of predicting yeast protein functional classifications using a support vector machine ({SVM}) trained on five types of data. {F}or this problem, the new method performs better than a previously-described {M}arkov random field method, and better than the {SVM} trained on any single type of data.},
  pdf       = {../local/Lanckriet2004Kernel-baseda.pdf},
  file      = {Lanckriet2004Kernel-baseda.pdf:local/Lanckriet2004Kernel-baseda.pdf:PDF},
  keywords  = {biosvm},
  owner     = {jeanphilippevert}
}
@article{Lanckriet2004statistical,
  author    = {Lanckriet, G. R. G. and De Bie, T. and Cristianini, N. and Jordan, M. I. and Noble, W. S.},
  title     = {A statistical framework for genomic data fusion},
  journal   = {Bioinformatics},
  year      = {2004},
  volume    = {20},
  number    = {16},
  pages     = {2626--2635},
  abstract  = {Motivation: {D}uring the past decade, the new focus on genomics has highlighted a particular challenge: to integrate the different views of the genome that are provided by various types of experimental data. {R}esults: {T}his paper describes a computational framework for integrating and drawing inferences from a collection of genome-wide measurements. {E}ach dataset is represented via a kernel function, which defines generalized similarity relationships between pairs of entities, such as genes or proteins. {T}he kernel representation is both flexible and efficient, and can be applied to many different types of data. {F}urthermore, kernel functions derived from different types of data can be combined in a straightforward fashion. {R}ecent advances in the theory of kernel methods have provided efficient algorithms to perform such combinations in a way that minimizes a statistical loss function. {T}hese methods exploit semidefinite programming techniques to reduce the problem of finding optimizing kernel combinations to a convex optimization problem. {C}omputational experiments performed using yeast genome-wide datasets, including amino acid sequences, hydropathy profiles, gene expression data and known protein-protein interactions, demonstrate the utility of this approach. {A} statistical learning algorithm trained from all of these data to recognize particular classes of proteins--membrane proteins and ribosomal proteins--performs significantly better than the same algorithm trained on any single type of data. {A}vailability: {S}upplementary data at http://noble.gs.washington.edu/proj/sdp-svm},
  doi       = {10.1093/bioinformatics/bth294},
  pdf       = {../local/Lanckriet2004statistical.pdf},
  file      = {Lanckriet2004statistical.pdf:local/Lanckriet2004statistical.pdf:PDF},
  keywords  = {biosvm},
  owner     = {jeanphilippevert},
  url       = {http://bioinformatics.oupjournals.org/cgi/content/abstract/20/16/2626}
}
@article{Lee2003Discovery,
  author    = {Dongkwon Lee and Sang Wook Choi and Myengsoo Kim and Jin Hyun Park and Moonkyu Kim and Jungchul Kim and In-Beum Lee},
  title     = {Discovery of differentially expressed genes related to histological subtype of hepatocellular carcinoma.},
  journal   = {Biotechnol {P}rog.},
  year      = {2003},
  volume    = {19},
  number    = {3},
  pages     = {1011--5},
  abstract  = {Hepatocellular carcinoma ({HCC}) is one of the most common human malignancies in the world. {T}o identify the histological subtype-specific genes of {HCC}, we analyzed the gene expression profile of 10 {HCC} patients by means of c{DNA} microarray. {W}e proposed a systematic approach for determining the discriminatory genes and revealing the biological phenomena of {HCC} with c{DNA} microarray data. {F}irst, normalization of c{DNA} microarray data was performed to reduce or minimize systematic variations. {O}n the basis of the suitably normalized data, we identified specific genes involved in histological subtype of {HCC}. {T}wo classification methods, {F}isher's discriminant analysis ({FDA}) and support vector machine ({SVM}), were used to evaluate the reliability of the selected genes and discriminate the histological subtypes of {HCC}. {T}his study may provide a clue for the needs of different chemotherapy and the reason for heterogeneity of the clinical responses according to histological subtypes.},
  doi       = {10.1021/bp025746a},
  pdf       = {../local/Lee2003Discovery.pdf},
  file      = {Lee2003Discovery.pdf:local/Lee2003Discovery.pdf:PDF},
  keywords  = {biosvm},
  url       = {http://dx.doi.org/10.1021/bp025746a}
}
@article{Lee2003Classification,
  author    = {Lee, Y. and Lee, C.-K.},
  title     = {Classification of multiple cancer types by multicategory support vector machines using gene expression data},
  journal   = {Bioinformatics},
  year      = {2003},
  volume    = {19},
  number    = {9},
  pages     = {1132--1139},
  abstract  = {Motivation: {H}igh-density {DNA} microarray measures the activities of several thousand genes simultaneously and the gene expression profiles have been used for the cancer classification recently. {T}his new approach promises to give better therapeutic measurements to cancer patients by diagnosing cancer types with improved accuracy. {T}he {S}upport {V}ector {M}achine ({SVM}) is one of the classification methods successfully applied to the cancer diagnosis problems. {H}owever, its optimal extension to more than two classes was not obvious, which might impose limitations in its application to multiple tumor types. {W}e briefly introduce the {M}ulticategory {SVM}, which is a recently proposed extension of the binary {SVM}, and apply it to multiclass cancer diagnosis problems. {R}esults: {I}ts applicability is demonstrated on the leukemia data ({G}olub et al., 1999) and the small round blue cell tumors of childhood data ({K}han et al., 2001). {C}omparable classification accuracy shown in the applications and its flexibility render the {MSVM} a viable alternative to other classification methods. {S}upplementary {I}nformation: http://www.stat.ohio-state.edu/~yklee/msvm.html {C}ontact: yklee@stat.ohio-state.edu},
  pdf       = {../local/Lee2003Classification.pdf},
  file      = {Lee2003Classification.pdf:local/Lee2003Classification.pdf:PDF},
  keywords  = {biosvm},
  owner     = {jeanphilippevert},
  url       = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/9/1132}
}
@inproceedings{Leslie2002spectrum,
  author    = {Leslie, C. and Eskin, E. and Noble, W. S.},
  title     = {The spectrum kernel: a string kernel for {SVM} protein classification},
  booktitle = {Proceedings of the {P}acific {S}ymposium on {B}iocomputing 2002},
  year      = {2002},
  editor    = {Russ B. Altman and A. Keith Dunker and Lawrence Hunter and Kevin Lauderdale and Teri E. Klein},
  pages     = {564--575},
  address   = {Singapore},
  publisher = {World Scientific},
  pdf       = {../local/lesl02.pdf},
  file      = {lesl02.pdf:local/lesl02.pdf:PDF},
  keywords  = {biosvm},
  subject   = {biokernel}
}
@inproceedings{Leslie2003Mismatch,
  author    = {Leslie, C. and Eskin, E. and Weston, J. and Noble, W. S.},
  title     = {Mismatch {S}tring {K}ernels for {SVM} {P}rotein {C}lassification},
  booktitle = {Advances in {N}eural {I}nformation {P}rocessing {S}ystems 15},
  year      = {2003},
  editor    = {Suzanna Becker and Sebastian Thrun and Klaus Obermayer},
  publisher = {MIT Press},
  pdf       = {../local/lesl02b.pdf},
  file      = {lesl02b.pdf:local/lesl02b.pdf:PDF},
  keywords  = {biosvm},
  subject   = {biokernel},
  url       = {http://www.cs.columbia.edu/~cleslie/papers/mismatch-short.pdf}
}
@article{Leslie2004Mismatch,
  author   = {Leslie, C. S. and Eskin, E. and Cohen, A. and Weston, J. and Noble, W. S.},
  title    = {Mismatch string kernels for discriminative protein classification},
  journal  = {Bioinformatics},
  year     = {2004},
  volume   = {20},
  number   = {4},
  pages    = {467--476},
  abstract = {Motivation: {C}lassification of proteins sequences into functional and structural families based on sequence homology is a central problem in computational biology. {D}iscriminative supervised machine learning approaches provide good performance, but simplicity and computational efficiency of training and prediction are also important concerns. {R}esults: {W}e introduce a class of string kernels, called mismatch kernels, for use with support vector machines ({SVM}s) in a discriminative approach to the problem of protein classification and remote homology detection. {T}hese kernels measure sequence similarity based on shared occurrences of fixed-length patterns in the data, allowing for mutations between patterns. {T}hus, the kernels provide a biologically well-motivated way to compare protein sequences without relying on family-based generative models such as hidden {M}arkov models. {W}e compute the kernels efficiently using a mismatch tree data structure, allowing us to calculate the contributions of all patterns occurring in the data in one pass while traversing the tree. {W}hen used with an {SVM}, the kernels enable fast prediction on test sequences. {W}e report experiments on two benchmark {SCOP} datasets, where we show that the mismatch kernel used with an {SVM} classifier performs competitively with state-of-the-art methods for homology detection, particularly when very few training examples are available. {E}xamination of the highest-weighted patterns learned by the {SVM} classifier recovers biologically important motifs in protein families and superfamilies. {A}vailability: {SVM} software is publicly available at http://microarray.cpmc.columbia.edu/gist. {M}ismatch kernel software is available upon request.},
  pdf      = {../local/Leslie2004Mismatch.pdf},
  file     = {Leslie2004Mismatch.pdf:local/Leslie2004Mismatch.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/20/4/467}
}
@comment{Lett2004Interaction: the record carries no page range; 75 is stored in pages as the article number (it matches the trailing component of the DOI and of the URL).}
@article{Lett2004Interaction,
  author   = {Lett, D. and Hsing, M. and Pio, F.},
  title    = {Interaction profile-based protein classification of death domain},
  journal  = {B{MC} {B}ioinformatics},
  year     = {2004},
  volume   = {5},
  pages    = {75},
  abstract = {Background {T}he increasing number of protein sequences and 3{D} structure obtained from genomic initiatives is leading many of us to focus on proteomics, and to dedicate our experimental and computational efforts on the creation and analysis of information derived from 3{D} structure. {I}n particular, the high-throughput generation of protein-protein interaction data from a few organisms makes such an approach very important towards understanding the molecular recognition that make-up the entire protein-protein interaction network. {S}ince the generation of sequences, and experimental protein-protein interactions increases faster than the 3{D} structure determination of protein complexes, there is tremendous interest in developing in silico methods that generate such structure for prediction and classification purposes. {I}n this study we focused on classifying protein family members based on their protein-protein interaction distinctiveness. {S}tructure-based classification of protein-protein interfaces has been described initially by {P}onstingl et al. [1] and more recently by {V}aldar et al. [2] and {M}intseris et al. [3], from complex structures that have been solved experimentally. {H}owever, little has been done on protein classification based on the prediction of protein-protein complexes obtained from homology modeling and docking simulation. {R}esults {W}e have developed an in silico classification system entitled {HODOCO} ({H}omology modeling, {D}ocking and {C}lassification {O}racle), in which protein {R}esidue {P}otential {I}nteraction {P}rofiles ({RPIPS}) are used to summarize protein-protein interaction characteristics. {T}his system applied to a dataset of 64 proteins of the death domain superfamily was used to classify each member into its proper subfamily. {T}wo classification methods were attempted, heuristic and support vector machine learning. {B}oth methods were tested with a 5-fold cross-validation. {T}he heuristic approach yielded a 61\% average accuracy, while the machine learning approach yielded an 89\% average accuracy. {C}onclusion {W}e have confirmed the reliability and potential value of classifying proteins via their predicted interactions. {O}ur results are in the same range of accuracy as other studies that classify protein-protein interactions from 3{D} complex structure obtained experimentally. {W}hile our classification scheme does not take directly into account sequence information our results are in agreement with functional and sequence based classification of death domain family members.},
  doi      = {10.1186/1471-2105-5-75},
  pdf      = {../local/Lett2004Interaction.pdf},
  file     = {Lett2004Interaction.pdf:local/Lett2004Interaction.pdf:PDF},
  keywords = {biosvm},
  owner    = {vert},
  url      = {http://www.biomedcentral.com/1471-2105/5/75}
}
@article{Li2005Prediction,
  author   = {H. Li and C. Ung and C. Yap and Y. Xue and Z. Li and Z. Cao and Y. Chen},
  title    = {Prediction of genotoxicity of chemical compounds by statistical learning methods.},
  journal  = {Chem. {R}es. {T}oxicol.},
  year     = {2005},
  volume   = {18},
  number   = {6},
  pages    = {1071--1080},
  month    = jun,
  abstract = {Various toxicological profiles, such as genotoxic potential, need to be studied in drug discovery processes and submitted to the drug regulatory authorities for drug safety evaluation. {A}s part of the effort for developing low cost and efficient adverse drug reaction testing tools, several statistical learning methods have been used for developing genotoxicity prediction systems with an accuracy of up to 73.8\% for genotoxic ({GT}+) and 92.8\% for nongenotoxic ({GT}-) agents. {T}hese systems have been developed and tested by using less than 400 known {GT}+ and {GT}- agents, which is significantly less in number and diversity than the 860 {GT}+ and {GT}- agents known at present. {T}here is a need to examine if a similar level of accuracy can be achieved for the more diverse set of molecules and to evaluate other statistical learning methods not yet applied to genotoxicity prediction. {T}his work is intended for testing several statistical learning methods by using 860 {GT}+ and {GT}- agents, which include support vector machines ({SVM}), probabilistic neural network ({PNN}), k-nearest neighbor (k-{NN}), and {C}4.5 decision tree ({DT}). {A} feature selection method, recursive feature elimination, is used for selecting molecular descriptors relevant to genotoxicity study. {T}he overall accuracies of {SVM}, k-{NN}, and {PNN} are comparable to and those of {DT} lower than the results from earlier studies, with {SVM} giving the highest accuracies of 77.8\% for {GT}+ and 92.7\% for {GT}- agents. {O}ur study suggests that statistical learning methods, particularly {SVM}, k-{NN}, and {PNN}, are useful for facilitating the prediction of genotoxic potential of a diverse set of molecules.},
  doi      = {10.1021/tx049652h},
  pdf      = {../local/Li2005Prediction.pdf},
  file     = {Li2005Prediction.pdf:local/Li2005Prediction.pdf:PDF},
  keywords = {biosvm chemoinformatics},
  url      = {http://dx.doi.org/10.1021/tx049652h}
}
@article{Li2005robust,
  author   = {Li, L. and Jiang, W. and Li, X. and Moser, K. L. and Guo, Z. and Du, L. and Wang, Q. and Topol, E. J. and Wang, Q. and Rao, S.},
  title    = {A robust hybrid between genetic algorithm and support vector machine for extracting an optimal feature gene subset},
  journal  = {Genomics},
  year     = {2005},
  volume   = {85},
  number   = {1},
  pages    = {16--23},
  abstract = {Development of a robust and efficient approach for extracting useful information from microarray data continues to be a significant and challenging task. {M}icroarray data are characterized by a high dimension, high signal-to-noise ratio, and high correlations between genes, but with a relatively small sample size. {C}urrent methods for dimensional reduction can further be improved for the scenario of the presence of a single (or a few) high influential gene(s) in which its effect in the feature subset would prohibit inclusion of other important genes. {W}e have formalized a robust gene selection approach based on a hybrid between genetic algorithm and support vector machine. {T}he major goal of this hybridization was to exploit fully their respective merits (e.g., robustness to the size of solution space and capability of handling a very large dimension of feature genes) for identification of key feature genes (or molecular signatures) for a complex biological phenotype. {W}e have applied the approach to the microarray data of diffuse large {B} cell lymphoma to demonstrate its behaviors and properties for mining the high-dimension data of genome-wide gene expression profiles. {T}he resulting classifier(s) (the optimal gene subset(s)) has achieved the highest accuracy (99\%) for prediction of independent microarray samples in comparisons with marginal filters and a hybrid between genetic algorithm and {K} nearest neighbors.},
  doi      = {10.1016/j.ygeno.2004.09.007},
  pdf      = {../local/Li2005robust.pdf},
  file     = {Li2005robust.pdf:local/Li2005robust.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://dx.doi.org/10.1016/j.ygeno.2004.09.007}
}
@article{Li2004Data,
  author   = {Li, L. and Tang, H. and Wu, Z. and Gong, J. and Gruidl, M. and Zou, J. and Tockman, M. and Clark, R. A.},
  title    = {Data mining techniques for cancer detection using serum proteomic profiling.},
  journal  = {Artif. {I}ntell. {M}ed.},
  year     = {2004},
  volume   = {32},
  number   = {2},
  pages    = {71--83},
  abstract = {O{BJECTIVE}: {P}athological changes in an organ or tissue may be reflected in proteomic patterns in serum. {I}t is possible that unique serum proteomic patterns could be used to discriminate cancer samples from non-cancer ones. {D}ue to the complexity of proteomic profiling, a higher order analysis such as data mining is needed to uncover the differences in complex proteomic patterns. {T}he objectives of this paper are (1) to briefly review the application of data mining techniques in proteomics for cancer detection/diagnosis; (2) to explore a novel analytic method with different feature selection methods; (3) to compare the results obtained on different datasets and that reported by {P}etricoin et al. in terms of detection performance and selected proteomic patterns. {METHODS} {AND} {MATERIAL}: {T}hree serum {SELDI} {MS} data sets were used in this research to identify serum proteomic patterns that distinguish the serum of ovarian cancer cases from non-cancer controls. {A} support vector machine-based method is applied in this study, in which statistical testing and genetic algorithm-based methods are used for feature selection respectively. {L}eave-one-out cross validation with receiver operating characteristic ({ROC}) curve is used for evaluation and comparison of cancer detection performance. {RESULTS} {AND} {CONCLUSIONS}: {T}he results showed that (1) data mining techniques can be successfully applied to ovarian cancer detection with a reasonably high performance; (2) the classification using features selected by the genetic algorithm consistently outperformed those selected by statistical testing in terms of accuracy and robustness; (3) the discriminatory features (proteomic patterns) can be very different from one selection method to another. {I}n other words, the pattern selection and its classification efficiency are highly classifier dependent. {T}herefore, when using data mining techniques, the discrimination of cancer from normal does not depend solely upon the identity and origination of cancer-related proteins.},
  doi      = {10.1016/j.artmed.2004.03.006},
  pdf      = {../local/Li2004Data.pdf},
  file     = {Li2004Data.pdf:local/Li2004Data.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://dx.doi.org/10.1016/j.artmed.2004.03.006}
}
@article{Li2004comparative,
  author   = {Li, T. and Zhang, C. and Ogihara, M.},
  title    = {A comparative study of feature selection and multiclass classification methods for tissue classification based on gene expression},
  journal  = {Bioinformatics},
  year     = {2004},
  volume   = {20},
  number   = {15},
  pages    = {2429--2437},
  abstract = {Summary: {T}his paper studies the problem of building multiclass classifiers for tissue classification based on gene expression. {T}he recent development of microarray technologies has enabled biologists to quantify gene expression of tens of thousands of genes in a single experiment. {B}iologists have begun collecting gene expression for a large number of samples. {O}ne of the urgent issues in the use of microarray data is to develop methods for characterizing samples based on their gene expression. {T}he most basic step in the research direction is binary sample classification, which has been studied extensively over the past few years. {T}his paper investigates the next step--multiclass classification of samples based on gene expression. {T}he characteristics of expression data (e.g. large number of genes with small sample size) makes the classification problem more challenging. {T}he process of building multiclass classifiers is divided into two components: (i) selection of the features (i.e. genes) to be used for training and testing and (ii) selection of the classification method. {T}his paper compares various feature selection methods as well as various state-of-the-art classification methods on various multiclass gene expression datasets. {O}ur study indicates that multiclass classification problem is much more difficult than the binary one for the gene expression datasets. {T}he difficulty lies in the fact that the data are of high dimensionality and that the sample size is small. {T}he classification accuracy appears to degrade very rapidly as the number of classes increases. {I}n particular, the accuracy was very low regardless of the choices of the methods for large-class datasets (e.g. {NCI}60 and {GCM}). {W}hile increasing the number of samples is a plausible solution to the problem of accuracy degradation, it is important to develop algorithms that are able to analyze effectively multiple-class expression data for these special datasets.},
  pdf      = {../local/Li2004comparative.pdf},
  file     = {Li2004comparative.pdf:local/Li2004comparative.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/20/15/2429}
}
@article{Liao2003Combining,
  author   = {Liao, L. and Noble, W. S.},
  title    = {Combining {P}airwise {S}equence {S}imilarity and {S}upport {V}ector {M}achines for {D}etecting {R}emote {P}rotein {E}volutionary and {S}tructural {R}elationships},
  journal  = {J. {C}omput. {B}iol.},
  year     = {2003},
  volume   = {10},
  number   = {6},
  pages    = {857--868},
  abstract = {One key element in understanding the molecular machinery of the cell is to understand the structure and function of each protein encoded in the genome. {A} very successful means of inferring the structure or function of a previously unannotated protein is via sequence similarity with one or more proteins whose structure or function is already known. {T}oward this end, we propose a means of representing proteins using pairwise sequence similarity scores. {T}his representation, combined with a discriminative classification algorithm known as the support vector machine ({SVM}), provides a powerful means of detecting subtle structural and evolutionary relationships among proteins. {T}he algorithm, called {SVM}-pairwise, when tested on its ability to recognize previously unseen families from the {SCOP} database, yields significantly better performance than {SVM}-{F}isher, profile {HMM}s, and {PSI}-{BLAST}.},
  doi      = {10.1089/106652703322756113},
  pdf      = {../local/Liao2003Combining.pdf},
  file     = {Liao2003Combining.pdf:local/Liao2003Combining.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://www.liebertonline.com/doi/abs/10.1089/106652703322756113}
}
@inproceedings{Liao2002Combining,
  author    = {Liao, L. and Noble, W. S.},
  title     = {Combining pairwise sequence similarity and support vector machines for remote protein homology detection},
  booktitle = {Proceedings of the {S}ixth {I}nternational {C}onference on {C}omputational {M}olecular {B}iology},
  year      = {2002},
  url       = {http://www.cs.columbia.edu/~bgrundy/papers/fps-svm.html},
  pdf       = {../local/liao02.pdf},
  file      = {liao02.pdf:local/liao02.pdf:PDF},
  keywords  = {biosvm},
  subject   = {biokernelcasp}
}
@article{Lin2002Conserved,
  author   = {Lin, K. and Kuang, Y. and Joseph, J. S. and Kolatkar, P. R.},
  title    = {Conserved codon composition of ribosomal protein coding genes in {E}scherichia coli, {M}ycobacterium tuberculosis and {S}accharomyces cerevisiae: lessons from supervised machine learning in functional genomics},
  journal  = {Nucl. {A}cids {R}es.},
  year     = {2002},
  volume   = {30},
  number   = {11},
  pages    = {2599--2607},
  abstract = {Genomics projects have resulted in a flood of sequence data. {F}unctional annotation currently relies almost exclusively on inter-species sequence comparison and is restricted in cases of limited data from related species and widely divergent sequences with no known homologs. {H}ere, we demonstrate that codon composition, a fusion of codon usage bias and amino acid composition signals, can accurately discriminate, in the absence of sequence homology information, cytoplasmic ribosomal protein genes from all other genes of known function in {S}accharomyces cerevisiae, {E}scherichia coli and {M}ycobacterium tuberculosis using an implementation of support vector machines, {SVM}light. {A}nalysis of these codon composition signals is instructive in determining features that confer individuality to ribosomal protein genes. {E}ach of the sets of positively charged, negatively charged and small hydrophobic residues, as well as codon bias, contribute to their distinctive codon composition profile. {T}he representation of all these signals is sensitively detected, combined and augmented by the {SVM}s to perform an accurate classification. {O}f special mention is an obvious outlier, yeast gene {RPL}22{B}, highly homologous to {RPL}22{A} but employing very different codon usage, perhaps indicating a non-ribosomal function. {F}inally, we propose that codon composition be used in combination with other attributes in gene/protein classification by supervised machine learning algorithms.},
  pdf      = {../local/Lin2002Conserved.pdf},
  file     = {Lin2002Conserved.pdf:local/Lin2002Conserved.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://nar.oupjournals.org/cgi/content/abstract/30/11/2599}
}
@article{Lind2003Support,
  author   = {P. Lind and T. Maltseva},
  title    = {Support vector machines for the estimation of aqueous solubility.},
  journal  = {J {C}hem {I}nf {C}omput {S}ci},
  year     = {2003},
  volume   = {43},
  number   = {6},
  pages    = {1855--1859},
  abstract = {Support {V}ector {M}achines ({SVM}s) are used to estimate aqueous solubility of organic compounds. {A} {SVM} equipped with a {T}animoto similarity kernel estimates solubility with accuracy comparable to results from other reported methods where the same data sets have been studied. {C}omplete cross-validation on a diverse data set resulted in a root-mean-squared error = 0.62 and {R}(2) = 0.88. {T}he data input to the machine is in the form of molecular fingerprints. {N}o physical parameters are explicitly involved in calculations.},
  doi      = {10.1021/ci034107s},
  pdf      = {../local/Lind2003Support.pdf},
  file     = {Lind2003Support.pdf:local/Lind2003Support.pdf:PDF},
  keywords = {biosvm chemoinformatics},
  url      = {http://dx.doi.org/10.1021/ci034107s}
}
@comment{Listgarten2004Predictive: eprint holds a full reprint URL rather than an archive identifier; it is kept verbatim. NOTE(review): confirm whether any consumer reads it before normalising.}
@article{Listgarten2004Predictive,
  author   = {Listgarten, J. and Damaraju, S. and Poulin, B. and Cook, L. and Dufour, J. and Driga, A. and Mackey, J. and Wishart, D. and Greiner, R. and Zanke, B.},
  title    = {Predictive {M}odels for {B}reast {C}ancer {S}usceptibility from {M}ultiple {S}ingle {N}ucleotide {P}olymorphisms},
  journal  = {Clin. {C}ancer {R}es.},
  year     = {2004},
  volume   = {10},
  number   = {8},
  pages    = {2725--2737},
  abstract = {Hereditary predisposition and causative environmental exposures have long been recognized in human malignancies. {I}n most instances, cancer cases occur sporadically, suggesting that environmental influences are critical in determining cancer risk. {T}o test the influence of genetic polymorphisms on breast cancer risk, we have measured 98 single nucleotide polymorphisms ({SNP}s) distributed over 45 genes of potential relevance to breast cancer etiology in 174 patients and have compared these with matched normal controls. {U}sing machine learning techniques such as support vector machines ({SVM}s), decision trees, and naive {B}ayes, we identified a subset of three {SNP}s as key discriminators between breast cancer and controls. {T}he {SVM}s performed maximally among predictive models, achieving 69\% predictive power in distinguishing between the two groups, compared with a 50\% baseline predictive power obtained from the data after repeated random permutation of class labels (individuals with cancer or controls). {H}owever, the simpler naive {B}ayes model as well as the decision tree model performed quite similarly to the {SVM}. {T}he three {SNP} sites most useful in this model were (a) the +4536{T}/{C} site of the aldosterone synthase gene {CYP}11{B}2 at amino acid residue 386 {V}al/{A}la ({T}/{C}) (rs4541); (b) the +4328{C}/{G} site of the aryl hydrocarbon hydroxylase {CYP}1{B}1 at amino acid residue 293 {L}eu/{V}al ({C}/{G}) (rs5292); and (c) the +4449{C}/{T} site of the transcription factor {BCL}6 at amino acid 387 {A}sp/{A}sp (rs1056932). {N}o single {SNP} site on its own could achieve more than 60\% in predictive accuracy. {W}e have shown that multiple {SNP} sites from different genes over distant parts of the genome are better at identifying breast cancer patients than any one {SNP} alone. {A}s high-throughput technology for {SNP}s improves and as more {SNP}s are identified, it is likely that much higher predictive accuracy will be achieved and a useful clinical tool developed.},
  eprint   = {http://clincancerres.aacrjournals.org/cgi/reprint/10/8/2725.pdf},
  pdf      = {../local/Listgarten2004Predictive.pdf},
  file     = {Listgarten2004Predictive.pdf:local/Listgarten2004Predictive.pdf:PDF},
  keywords = {biosvm, breastcancer},
  owner    = {jeanphilippevert},
  url      = {http://clincancerres.aacrjournals.org/cgi/content/abstract/10/8/2725}
}
@article{Liu2004Using,
  author   = {Huiqing Liu and Hao Han and Jinyan Li and Limsoon Wong},
  title    = {Using amino acid patterns to accurately predict translation initiation sites.},
  journal  = {In {S}ilico {B}iol.},
  year     = {2004},
  volume   = {4},
  number   = {3},
  pages    = {255--269},
  abstract = {The translation initiation site ({TIS}) prediction problem is about how to correctly identify {TIS} in m{RNA}, c{DNA}, or other types of genomic sequences. {H}igh prediction accuracy can be helpful in a better understanding of protein coding from nucleotide sequences. {T}his is an important step in genomic analysis to determine protein coding from nucleotide sequences. {I}n this paper, we present an in silico method to predict translation initiation sites in vertebrate c{DNA} or m{RNA} sequences. {T}his method consists of three sequential steps as follows. {I}n the first step, candidate features are generated using k-gram amino acid patterns. {I}n the second step, a small number of top-ranked features are selected by an entropy-based algorithm. {I}n the third step, a classification model is built to recognize true {TIS}s by applying support vector machines or ensembles of decision trees to the selected features. {W}e have tested our method on several independent data sets, including two public ones and our own extracted sequences. {T}he experimental results achieved are better than those reported previously using the same data sets. {O}ur high accuracy not only demonstrates the feasibility of our method, but also indicates that there might be "amino acid" patterns around {TIS} in c{DNA} and m{RNA} sequences.},
  keywords = {biosvm},
  pii      = {2004040022},
  url      = {http://www.bioinfo.de/isb/2004/04/0022/}
}
@article{Liu2003in-silico,
  author   = {Huiqing Liu and Hao Han and Jinyan Li and Limsoon Wong},
  title    = {An in-silico method for prediction of polyadenylation signals in human sequences.},
  journal  = {Genome {I}nform {S}er {W}orkshop {G}enome {I}nform},
  year     = {2003},
  volume   = {14},
  pages    = {84--93},
  abstract = {This paper presents a machine learning method to predict polyadenylation signals ({PAS}es) in human {DNA} and m{RNA} sequences by analysing features around them. {T}his method consists of three sequential steps of feature manipulation: generation, selection and integration of features. {I}n the first step, new features are generated using k-gram nucleotide acid or amino acid patterns. {I}n the second step, a number of important features are selected by an entropy-based algorithm. {I}n the third step, support vector machines are employed to recognize true {PAS}es from a large number of candidates. {O}ur study shows that true {PAS}es in {DNA} and m{RNA} sequences can be characterized by different features, and also shows that both upstream and downstream sequence elements are important for recognizing {PAS}es from {DNA} sequences. {W}e tested our method on several public data sets as well as our own extracted data sets. {I}n most cases, we achieved better validation results than those reported previously on the same data sets. {T}he important motifs observed are highly consistent with those reported in literature.},
  keywords = {biosvm}
}
@comment{Liu2005Use: no volume, number or pages are recorded for this entry. NOTE(review): presumably captured before print publication; complete from the DOI when possible.}
@article{Liu2005Use,
  author   = {Huiqing Liu and Jinyan Li and Limsoon Wong},
  title    = {Use of extreme patient samples for outcome prediction from gene expression data.},
  journal  = {Bioinformatics},
  year     = {2005},
  month    = jun,
  abstract = {M{OTIVATION}: {P}atient outcome prediction using microarray technologies is an important application in bioinformatics. {B}ased on patients' genotypic microarray data, predictions are made to estimate patients' survival time and their risk of tumor metastasis or recurrence. {S}o, accurate prediction can potentially help to provide better treatment for patients. {RESULTS}: {W}e present a new computational method for patient outcome prediction. {I}n the training phase of this method, we make use of two types of extreme patient samples: short-term survivors who got an unfavorable outcome within a short period and long-term survivors who were maintaining a favorable outcome after a long follow-up time. {T}hese extreme training samples yield a clear platform for us to identify relevant genes whose expression is closely related to the outcome. {T}he selected extreme samples and the relevant genes are then integrated by a support vector machine to build a prediction model, by which each validation sample is assigned a risk score that falls into one of special pre-defined risk groups. {W}e apply this method to several public data sets. {I}n most cases, patients in high and low risk groups stratified by our method have clearly distinguishable outcome status as seen in their {K}aplan-{M}eier curves. {W}e also show that the idea of selecting only extreme patient samples for training is effective for improving the prediction accuracy when different gene selection methods are used. {SUPPLEMENTARY} {INFORMATION}: http://research.i2r.a-star.edu.sg/huiqing/supplementaldata/survival/survival.html.},
  doi      = {10.1093/bioinformatics/bti544},
  pdf      = {../local/Liu2005Use.pdf},
  file     = {Liu2005Use.pdf:local/Liu2005Use.pdf:PDF},
  keywords = {biosvm},
  pii      = {bti544},
  url      = {http://dx.doi.org/10.1093/bioinformatics/bti544}
}
@article{Liu2004Quantitative,
  author   = {H. X. Liu and C. X. Xue and R. S. Zhang and X. J. Yao and M. C. Liu and Z. D. Hu and B. T. Fan},
  title    = {Quantitative prediction of logk of peptides in high-performance liquid chromatography based on molecular descriptors by using the heuristic method and support vector machine.},
  journal  = {J {C}hem {I}nf {C}omput {S}ci},
  year     = {2004},
  volume   = {44},
  number   = {6},
  pages    = {1979--1986},
  abstract = {A new method support vector machine ({SVM}) and the heuristic method ({HM}) were used to develop the nonlinear and linear models between the capacity factor (logk) and seven molecular descriptors of 75 peptides for the first time. {T}he molecular descriptors representing the structural features of the compounds only included the constitutional and topological descriptors, which can be obtained easily without optimizing the structure of the molecule. {T}he seven molecular descriptors selected by the heuristic method in {CODESSA} were used as inputs for {SVM}. {T}he results obtained by {SVM} were compared with those obtained by the heuristic method. {T}he prediction result of the {SVM} model is better than that of heuristic method. {F}or the test set, a predictive correlation coefficient {R} = 0.9801 and root-mean-square error of 0.1523 were obtained. {T}he prediction results are in very good agreement with the experimental values. {B}ut the linear model of the heuristic method is easier to understand and ready to use for a chemist. {T}his paper provided a new and effective method for predicting the chromatography retention of peptides and some insight into the structural features which are related to the capacity factor of peptides.},
  doi      = {10.1021/ci049891a},
  pdf      = {../local/Liu2004Quantitative.pdf},
  file     = {Liu2004Quantitative.pdf:local/Liu2004Quantitative.pdf:PDF},
  keywords = {biosvm},
  url      = {http://dx.doi.org/10.1021/ci049891a}
}
@article{Liu2004Prediction,
  author   = {H. X. Liu and R. S. Zhang and X. J. Yao and M. C. Liu and Z. D. Hu and B. T. Fan},
  title    = {Prediction of the isoelectric point of an amino acid based on {GA}-{PLS} and {SVM}s.},
  journal  = {J {C}hem {I}nf {C}omput {S}ci},
  year     = {2004},
  volume   = {44},
  number   = {1},
  pages    = {161--167},
  abstract = {The support vector machine ({SVM}), as a novel type of a learning machine, for the first time, was used to develop a {QSPR} model that relates the structures of 35 amino acids to their isoelectric point. {M}olecular descriptors calculated from the structure alone were used to represent molecular structures. {T}he seven descriptors selected using {GA}-{PLS}, which is a sophisticated hybrid approach that combines {GA} as a powerful optimization method with {PLS} as a robust statistical method for variable selection, were used as inputs of {RBFNN}s and {SVM} to predict the isoelectric point of an amino acid. {T}he optimal {QSPR} model developed was based on support vector machines, which showed the following results: the root-mean-square error of 0.2383 and the prediction correlation coefficient {R}=0.9702 were obtained for the whole data set. {S}atisfactory results indicated that the {GA}-{PLS} approach is a very effective method for variable selection, and the support vector machine is a very promising tool for the nonlinear approximation.},
  doi      = {10.1021/ci034173u},
  pdf      = {../local/Liu2004Prediction.pdf},
  file     = {Liu2004Prediction.pdf:local/Liu2004Prediction.pdf:PDF},
  keywords = {biosvm},
  url      = {http://dx.doi.org/10.1021/ci034173u}
}
@article{Liu2004QSAR,
  author   = {H. X. Liu and R. S. Zhang and X. J. Yao and M. C. Liu and Z. D. Hu and B. T. Fan},
  title    = {Q{SAR} and classification models of a novel series of {COX}-2 selective inhibitors: 1,5-diarylimidazoles based on support vector machines.},
  journal  = {J {C}omput {A}ided {M}ol {D}es},
  year     = {2004},
  volume   = {18},
  number   = {6},
  pages    = {389--399},
  month    = jun,
  abstract = {The support vector machine, which is a novel algorithm from the machine learning community, was used to develop quantitation and classification models which can be used as a potential screening mechanism for a novel series of {COX}-2 selective inhibitors. {E}ach compound was represented by calculated structural descriptors that encode constitutional, topological, geometrical, electrostatic, and quantum-chemical features. {T}he heuristic method was then used to search the descriptor space and select the descriptors responsible for activity. {Q}uantitative modelling results in a nonlinear, seven-descriptor model based on {SVM}s with root mean-square errors of 0.107 and 0.136 for training and prediction sets, respectively. {T}he best classification results are found using {SVM}s: the accuracy for training and test sets is 91.2\% and 88.2\%, respectively. {T}his paper proposes a new and effective method for drug design and screening.},
  keywords = {biosvm chemoinformatics}
}
@article{Liu2003QSAR, author = {H. X. Liu and R. S. Zhang and X. J. Yao and M. C. Liu and Z. D. Hu and B. T. Fan}, title = {Q{SAR} study of ethyl 2-[(3-methyl-2,5-dioxo(3-pyrrolinyl))amino]-4-(trifluoromethyl) pyrimidine-5-carboxylate: an inhibitor of {AP}-1 and {NF}-kappa {B} mediated gene expression based on support vector machines.}, journal = {J {C}hem {I}nf {C}omput {S}ci}, year = {2003}, volume = {43}, pages = {1288--1296}, number = {4}, abstract = {The support vector machine, as a novel type of learning machine, for the first time, was used to develop a {QSAR} model of 57 analogues of ethyl 2-[(3-methyl-2,5-dioxo(3-pyrrolinyl))amino]-4-(trifluoromethyl)pyrimidine-5-carboxylate ({EPC}), an inhibitor of {AP}-1 and {NF}-kappa {B} mediated gene expression, based on calculated quantum chemical parameters. {T}he quantum chemical parameters involved in the model are {K}ier and {H}all index (order3) ({KHI}3), {I}nformation content (order 0) ({IC}0), {YZ} {S}hadow ({YZS}) and {M}ax partial charge for an {N} atom ({M}ax{PCN}), {M}in partial charge for an {N} atom ({M}in{PCN}). {T}he mean relative error of the training set, the validation set, and the testing set is 1.35\%, 1.52\%, and 2.23\%, respectively, and the maximum relative error is less than 5.00\%.}, doi = {10.1021/ci0340355}, pdf = {../local/Liu2003QSAR.pdf}, file = {Liu2003QSAR.pdf:local/Liu2003QSAR.pdf:PDF}, keywords = {biosvm}, url = {http://dx.doi.org/10.1021/ci0340355} }
@article{Liu2005Multiclass, author = {Jane Jijun Liu and Gene Cutler and Wuxiong Li and Zheng Pan and Sihua Peng and Tim Hoey and Liangbiao Chen and Xuefeng Bruce Ling}, title = {Multiclass cancer classification and biomarker discovery using {GA}-based algorithms.}, journal = {Bioinformatics}, year = {2005}, volume = {21}, pages = {2691--2697}, number = {11}, month = jun, abstract = {M{OTIVATION}: {T}he development of microarray-based high-throughput gene profiling has led to the hope that this technology could provide an efficient and accurate means of diagnosing and classifying tumors, as well as predicting prognoses and effective treatments. {H}owever, the large amount of data generated by microarrays requires effective reduction of discriminant gene features into reliable sets of tumor biomarkers for such multiclass tumor discrimination. {T}he availability of reliable sets of biomarkers, especially serum biomarkers, should have a major impact on our understanding and treatment of cancer. {RESULTS}: {W}e have combined genetic algorithm ({GA}) and all paired ({AP}) support vector machine ({SVM}) methods for multiclass cancer categorization. {P}redictive features can be automatically determined through iterative {GA}/{SVM}, leading to very compact sets of non-redundant cancer-relevant genes with the best classification performance reported to date. {I}nterestingly, these different classifier sets harbor only modest overlapping gene features but have similar levels of accuracy in leave-one-out cross-validations ({LOOCV}). {F}urther characterization of these optimal tumor discriminant features, including the use of nearest shrunken centroids ({NSC}), analysis of annotations and literature text mining, reveals previously unappreciated tumor subclasses and a series of genes that could be used as cancer biomarkers. 
{W}ith this approach, we believe that microarray-based multiclass molecular analysis can be an effective tool for cancer biomarker discovery and subsequent molecular cancer diagnosis.}, doi = {10.1093/bioinformatics/bti419}, pdf = {../local/Liu2005Multiclass.pdf}, file = {Liu2005Multiclass.pdf:local/Liu2005Multiclass.pdf:PDF}, keywords = {biosvm}, pii = {bti419}, url = {http://dx.doi.org/10.1093/bioinformatics/bti419} }
@article{Liu2004Active, author = {Liu, Y.}, title = {Active learning with support vector machine applied to gene expression data for cancer classification}, journal = {J. {C}hem. {I}nf. {C}omput. {S}ci.}, year = {2004}, volume = {44}, pages = {1936--1941}, number = {6}, abstract = {There is growing interest in the application of machine learning techniques in bioinformatics. {T}he supervised machine learning approach has been widely applied to bioinformatics and gained a lot of success in this research area. {W}ith this learning approach researchers first develop a large training set, which is a time-consuming and costly process. {M}oreover, the proportion of the positive examples and negative examples in the training set may not represent the real-world data distribution, which causes concept drift. {A}ctive learning avoids these problems. {U}nlike most conventional learning methods where the training set used to derive the model remains static, the classifier can actively choose the training data and the size of training set increases. {W}e introduced an algorithm for performing active learning with support vector machine and applied the algorithm to gene expression profiles of colon cancer, lung cancer, and prostate cancer samples. {W}e compared the classification performance of active learning with that of passive learning. {T}he results showed that employing the active learning method can achieve high accuracy and significantly reduce the need for labeled training instances. {F}or lung cancer classification, to achieve 96\% of the total positives, only 31 labeled examples were needed in active learning whereas in passive learning 174 labeled examples were required. {T}hat meant over 82\% reduction was realized by active learning. 
{I}n active learning the areas under the receiver operating characteristic ({ROC}) curves were over 0.81, while in passive learning the areas under the {ROC} curves were below 0.50.}, doi = {10.1021/ci049810a}, pdf = {../local/Liu2004Active.pdf}, file = {Liu2004Active.pdf:local/Liu2004Active.pdf:PDF}, keywords = {biosvm}, owner = {jeanphilippevert}, url = {http://dx.doi.org/10.1021/ci049810a} }
@article{Liu2004comparative, author = {Y. Liu}, title = {A comparative study on feature selection methods for drug discovery.}, journal = {J {C}hem {I}nf {C}omput {S}ci}, year = {2004}, volume = {44}, pages = {1823--1828}, number = {5}, abstract = {Feature selection is frequently used as a preprocessing step to machine learning. {T}he removal of irrelevant and redundant information often improves the performance of learning algorithms. {T}his paper is a comparative study of feature selection in drug discovery. {T}he focus is on aggressive dimensionality reduction. {F}ive methods were evaluated, including information gain, mutual information, a chi2-test, odds ratio, and {GSS} coefficient. {T}wo well-known classification algorithms, {N}a{\"\i}ve {B}ayesian and {S}upport {V}ector {M}achine ({SVM}), were used to classify the chemical compounds. {T}he results showed that {N}a{\"\i}ve {B}ayesian benefited significantly from the feature selection, while {SVM} performed better when all features were used. {I}n this experiment, information gain and chi2-test were most effective feature selection methods. {U}sing information gain with a {N}a{\"\i}ve {B}ayesian classifier, removal of up to 96\% of the features yielded an improved classification accuracy measured by sensitivity. {W}hen information gain was used to select the features, {SVM} was much less sensitive to the reduction of feature space. {T}he feature set size was reduced by 99\%, while losing only a few percent in terms of sensitivity (from 58.7\% to 52.5\%) and specificity (from 98.4\% to 97.2\%). {I}n contrast to information gain and chi2-test, mutual information had relatively poor performance due to its bias toward favoring rare features and its sensitivity to probability estimation errors.}, doi = {10.1021/ci049875d}, pdf = {../local/Liu2004comparative.pdf}, file = {Liu2004comparative.pdf:local/Liu2004comparative.pdf:PDF}, keywords = {biosvm}, url = {http://dx.doi.org/10.1021/ci049875d} }
@article{Liu2005Gene, author = {Zhenqiu Liu and Dechang Chen and Halima Bensmail}, title = {Gene expression data classification with kernel principal component analysis.}, journal = {J {B}iomed {B}iotechnol}, year = {2005}, volume = {2005}, pages = {155--159}, number = {2}, abstract = {One important feature of the gene expression data is that the number of genes ${M}$ far exceeds the number of samples ${N}$ . {S}tandard statistical methods do not work well when ${N} < {M}$ . {D}evelopment of new methodologies or modification of existing methodologies is needed for the analysis of the microarray data. {I}n this paper, we propose a novel analysis procedure for classifying the gene expression data. {T}his procedure involves dimension reduction using kernel principal component analysis ({KPCA}) and classification with logistic regression (discrimination). {KPCA} is a generalization and nonlinear version of principal component analysis. {T}he proposed algorithm was applied to five different gene expression datasets involving human tumor samples. {C}omparison with other popular classification methods such as support vector machines and neural networks shows that our algorithm is very promising in classifying gene expression data.}, doi = {10.1155/JBB.2005.155}, pdf = {../local/Liu2005Gene.pdf}, file = {Liu2005Gene.pdf:local/Liu2005Gene.pdf:PDF}, keywords = {biosvm}, pii = {S1110724304406032_THIS_PII_IS_INCORRECT_}, url = {http://dx.doi.org/10.1155/JBB.2005.155} }
@article{Lo2005Effect, author = {Siaw Ling Lo and Cong Zhong Cai and Yu Zong Chen and Maxey C M Chung}, title = {Effect of training datasets on support vector machine prediction of protein-protein interactions.}, journal = {Proteomics}, year = {2005}, volume = {5}, pages = {876--884}, number = {4}, month = mar, abstract = {Knowledge of protein-protein interaction is useful for elucidating protein function via the concept of 'guilt-by-association'. {A} statistical learning method, {S}upport {V}ector {M}achine ({SVM}), has recently been explored for the prediction of protein-protein interactions using artificial shuffled sequences as hypothetical noninteracting proteins and it has shown promising results ({B}ock, {J}. {R}., {G}ough, {D}. {A}., {B}ioinformatics 2001, 17, 455-460). {I}t remains unclear however, how the prediction accuracy is affected if real protein sequences are used to represent noninteracting proteins. {I}n this work, this effect is assessed by comparison of the results derived from the use of real protein sequences with that derived from the use of shuffled sequences. {T}he real protein sequences of hypothetical noninteracting proteins are generated from an exclusion analysis in combination with subcellular localization information of interacting proteins found in the {D}atabase of {I}nteracting {P}roteins. {P}rediction accuracy using real protein sequences is 76.9\% compared to 94.1\% using artificial shuffled sequences. {T}he discrepancy likely arises from the expected higher level of difficulty for separating two sets of real protein sequences than that for separating a set of real protein sequences from a set of artificial sequences. {T}he use of real protein sequences for training a {SVM} classification system is expected to give better prediction results in practical cases. {T}his is tested by using both {SVM} systems for predicting putative protein partners of a set of thioredoxin related proteins. 
{T}he prediction results are consistent with observations, suggesting that real sequence is more practically useful in development of {SVM} classification system for facilitating protein-protein interaction prediction.}, doi = {10.1002/pmic.200401118}, pdf = {../local/Lo2005Effect.pdf}, file = {Lo2005Effect.pdf:local/Lo2005Effect.pdf:PDF}, keywords = {biosvm}, url = {http://dx.doi.org/10.1002/pmic.200401118} }
@article{Lodhi2002Text, author = {Lodhi, H. and Saunders, C. and Shawe-Taylor, J. and Cristianini, N. and Watkins, C.}, title = {Text classification using string kernels}, journal = {J. {M}ach. {L}earn. {R}es.}, year = {2002}, volume = {2}, pages = {419--444}, pdf = {../local/lodh02.pdf}, file = {lodh02.pdf:local/lodh02.pdf:PDF}, keywords = {biosvm}, subject = {kernel}, url = {http://www.ai.mit.edu/projects/jmlr/papers/volume2/lodhi02a/abstract.html} }
@inproceedings{Lodhi2000Text, author = {Lodhi, H. and Shawe-Taylor, J. and Cristianini, N. and Watkins, C. J. C. H.}, title = {Text {C}lassification using {S}tring {K}ernels}, booktitle = {Adv. {N}eural {I}nform. {P}rocess. {S}yst.}, year = {2000}, pages = {563--569}, pdf = {../local/lodh00.pdf}, file = {lodh00.pdf:local/lodh00.pdf:PDF}, keywords = {biosvm}, subject = {kernel}, url = {http://www.neurocolt.com/tech_reps/2000/00079.ps.gz} }
@techreport{Logan2001Study, author = {Logan, B. and Moreno, P. and Suzek, B. and Weng, Z. and Kasif, S.}, title = {A {S}tudy of {R}emote {H}omology {D}etection}, institution = {Compaq Cambridge Research Laboratory}, year = {2001}, number = {CRL 2001/05}, month = jun, abstract = {Functional annotation of newly sequenced genomes is an important challenge for computational biology systems. {W}hile much progress has been made towards scaling up experimental methods for functional assignment to putative genes, most current genomic annotation systems rely on computational solutions for homology modeling via sequence or structural similarity. {W}e present a new method for remote homology detection that relies on combining probabilistic modeling and supervised learning in high-dimensional features spaces. {O}ur system uses a transformation that converts protein domains to fixed-dimension representative feature vectors, where each feature records the sensitivity of each protein domain to a previously learned set of ``protein motifs'' or ``blocks''. {S}ubsequently, the system utilizes {S}upport {V}ector {M}achine ({SVM}) classifiers to learn the boundaries between structural protein classes. {O}ur experiments suggest that this technique performs well relative to several other remote homology methods for the majority of protein domains in {SCOP} 1.37 {PDB}90.}, pdf = {../local/Logan2001Study.pdf}, file = {Logan2001Study.pdf:local/Logan2001Study.pdf:PDF}, keywords = {biosvm}, owner = {jeanphilippevert} }
@article{Lu2003Expression, author = {Lu, Y. J. and Williamson, D. and Wang, R. and Summersgill, B. and Rodriguez, S. and Rogers, S. and Pritchard-Jones, K. and Campbell, C. and Shipley, J.}, title = {Expression profiling targeting chromosomes for tumor classification and prediction of clinical behavior.}, journal = {Genes {C}hromosomes {C}ancer}, year = {2003}, volume = {38}, pages = {207--214}, number = {3}, abstract = {Tumors are associated with altered or deregulated gene products that affect critical cellular functions. {H}ere we assess the use of a global expression profiling technique that identifies chromosome regions corresponding to differential gene expression, termed comparative expressed sequence hybridization ({CESH}). {CESH} analysis was performed on a total of 104 tumors with a diagnosis of rhabdomyosarcoma, leiomyosarcoma, prostate cancer, and favorable-histology {W}ilms tumors. {T}hrough the use of the chromosome regions identified as variables, support vector machine analysis was applied to assess classification potential, and feature selection (recursive feature elimination) was used to identify the best discriminatory regions. {W}e demonstrate that the {CESH} profiles have characteristic patterns in tumor groups and were also able to distinguish subgroups of rhabdomyosarcoma. {T}he overall {CESH} profiles in favorable-histology {W}ilms tumors were found to correlate with subsequent clinical behavior. {C}lassification by use of {CESH} profiles was shown to be similar in performance to previous microarray expression studies and highlighted regions for further investigation. {W}e conclude that analysis of chromosomal expression profiles can group, subgroup, and even predict clinical behavior of tumors to a level of performance similar to that of microarray analysis. 
{CESH} is independent of selecting sequences for interrogation and is a simple, rapid, and widely accessible approach to identify clinically useful differential expression.}, doi = {10.1002/gcc.10276}, pdf = {../local/Lu2003Expression.pdf}, file = {Lu2003Expression.pdf:local/Lu2003Expression.pdf:PDF}, keywords = {biosvm}, owner = {jeanphilippevert} }
@article{Luan2005Classification, author = {Feng Luan and Ruisheng Zhang and Chunyan Zhao and Xiaojun Yao and Mancang Liu and Zhide Hu and Botao Fan}, title = {Classification of the carcinogenicity of {N}-nitroso compounds based on support vector machines and linear discriminant analysis.}, journal = {Chem {R}es {T}oxicol}, year = {2005}, volume = {18}, pages = {198--203}, number = {2}, month = feb, abstract = {The support vector machine ({SVM}), as a novel type of learning machine, was used to develop a classification model of carcinogenic properties of 148 {N}-nitroso compounds. {T}he seven descriptors calculated solely from the molecular structures of compounds selected by forward stepwise linear discriminant analysis ({LDA}) were used as inputs of the {SVM} model. {T}he obtained results confirmed the discriminative capacity of the calculated descriptors. {T}he result of {SVM} (total accuracy of 95.2\%) is better than that of {LDA} (total accuracy of 89.8\%).}, doi = {10.1021/tx049782q}, pdf = {../local/Luan2005Classification.pdf}, file = {Luan2005Classification.pdf:local/Luan2005Classification.pdf:PDF}, keywords = {biosvm}, url = {http://dx.doi.org/10.1021/tx049782q} }
@inproceedings{Mahe2004Extensions, author = {Mah{\'e}, P. and Ueda, N. and Akutsu, T. and Perret, J.-L. and Vert, J.-P.}, title = {Extensions of marginalized graph kernels}, booktitle = {Proceedings of the {T}wenty-{F}irst {I}nternational {C}onference on {M}achine {L}earning ({ICML} 2004)}, year = {2004}, editor = {Greiner, R. and Schuurmans, D.}, pages = {552--559}, publisher = {ACM Press}, abstract = {Positive definite kernels between labeled graphs have recently been proposed. {T}hey enable the application of kernel methods, such as support vector machines, to the analysis and classification of graphs, for example, chemical compounds. {T}hese graph kernels are obtained by marginalizing a kernel between paths with respect to a random walk model on the graph vertices along the edges. {W}e propose two extensions of these graph kernels, with the double goal to reduce their computation time and increase their relevance as measure of similarity between graphs. {F}irst, we propose to modify the label of each vertex by automatically adding information about its environment with the use of the {M}organ algorithm. {S}econd, we suggest a modification of the random walk model to prevent the walk from coming back to a vertex that was just visited. {T}hese extensions are then tested on benchmark experiments of chemical compounds classification, with promising results.}, pdf = {../local/icmlMod.pdf}, file = {icmlMod.pdf:http\://cg.ensmp.fr/~vert/publi/04icml/icmlMod.pdf:PDF;icmlMod.pdf:http\://cg.ensmp.fr/~vert/publi/04icml/icmlMod.pdf:PDF}, keywords = {biosvm chemoinformatics}, owner = {vert} }
@article{Mahe2005Graph, author = {Mah{\'e}, P. and Ueda, N. and Akutsu, T. and Perret, J.-L. and Vert, J.-P.}, title = {Graph kernels for molecular structure-activity relationship analysis with support vector machines}, journal = {J. Chem. Inf. Model.}, year = {2005}, volume = {45}, pages = {939--951}, number = {4}, abstract = {The support vector machine algorithm together with graph kernel functions has recently been introduced to model structure-activity relationships ({SAR}) of molecules from their 2{D} structure, without the need for explicit molecular descriptor computation. {W}e propose two extensions to this approach with the double goal to reduce the computational burden associated with the model and to enhance its predictive accuracy: description of the molecules by a {M}organ index process and definition of a second-order {M}arkov model for random walks on 2{D} structures. {E}xperiments on two mutagenicity data sets validate the proposed extensions, making this approach a possible complementary alternative to other modeling strategies.}, doi = {10.1021/ci050039t}, pdf = {../local/Mahe2005Graph.pdf}, file = {Mahe2005Graph.pdf:local/Mahe2005Graph.pdf:PDF}, keywords = {biosvm chemoinformatics}, url = {http://dx.doi.org/10.1021/ci050039t} }
@article{Man2004Evaluating, author = {Man, M. Z. and Dyson, G. and Johnson, K. and Liao, B.}, title = {Evaluating methods for classifying expression data.}, journal = {J. {B}iopharm. {S}tat.}, year = {2004}, volume = {14}, pages = {1065--1084}, number = {4}, abstract = {An attractive application of expression technologies is to predict drug efficacy or safety using expression data of biomarkers. {T}o evaluate the performance of various classification methods for building predictive models, we applied these methods on six expression datasets. {T}hese datasets were from studies using microarray technologies and had either two or more classes. {F}rom each of the original datasets, two subsets were generated to simulate two scenarios in biomarker applications. {F}irst, a 50-gene subset was used to simulate a candidate gene approach when it might not be practical to measure a large number of genes/biomarkers. {N}ext, a 2000-gene subset was used to simulate a whole genome approach. {W}e evaluated the relative performance of several classification methods by using leave-one-out cross-validation and bootstrap cross-validation. {A}lthough all methods perform well in both subsets for a relative easy dataset with two classes, differences in performance do exist among methods for other datasets. {O}verall, partial least squares discriminant analysis ({PLS}-{DA}) and support vector machines ({SVM}) outperform all other methods. {W}e suggest a practical approach to take advantage of multiple methods in biomarker applications.}, doi = {10.1081/BIP-200035491}, pdf = {../local/Man2004Evaluating.pdf}, file = {Man2004Evaluating.pdf:local/Man2004Evaluating.pdf:PDF}, keywords = {biosvm}, owner = {jeanphilippevert} }
@article{Mao2005Multiclass, author = {Yong Mao and Xiaobo Zhou and Daoying Pi and Youxian Sun and Stephen T C Wong}, title = {Multiclass cancer classification by using fuzzy support vector machine and binary decision tree with gene selection.}, journal = {J {B}iomed {B}iotechnol}, year = {2005}, volume = {2005}, pages = {160--171}, number = {2}, abstract = {We investigate the problems of multiclass cancer classification with gene selection from gene expression data. {T}wo different constructed multiclass classifiers with gene selection are proposed, which are fuzzy support vector machine ({FSVM}) with gene selection and binary classification tree based on {SVM} with gene selection. {U}sing {F} test and recursive feature elimination based on {SVM} as gene selection methods, binary classification tree based on {SVM} with {F} test, binary classification tree based on {SVM} with recursive feature elimination based on {SVM}, and {FSVM} with recursive feature elimination based on {SVM} are tested in our experiments. {T}o accelerate computation, preselecting the strongest genes is also used. {T}he proposed techniques are applied to analyze breast cancer data, small round blue-cell tumors, and acute leukemia data. {C}ompared to existing multiclass cancer classifiers and binary classification tree based on {SVM} with {F} test or binary classification tree based on {SVM} with recursive feature elimination based on {SVM} mentioned in this paper, {FSVM} based on recursive feature elimination based on {SVM} can find most important genes that affect certain types of cancer with high recognition accuracy.}, doi = {10.1155/JBB.2005.160}, pdf = {../local/Mao2005Multiclass.pdf}, file = {Mao2005Multiclass.pdf:local/Mao2005Multiclass.pdf:PDF}, keywords = {biosvm}, pii = {S1110724304406044_THIS_PII_IS_INCORRECT_}, url = {http://dx.doi.org/10.1155/JBB.2005.160} }
@article{Markowetz2003Support, author = {F. Markowetz and L. Edler and M. Vingron}, title = {Support {V}ector {M}achines for {P}rotein {F}old {C}lass {P}rediction}, journal = {Biometrical {J}ournal}, year = {2003}, volume = {45}, pages = {377--389}, number = {3}, abstract = {Knowledge of the three-dimensional structure of a protein is essential for describing and understanding its function. {T}oday, a large number of known protein sequences faces a small number of identified structures. {T}hus, the need arises to predict structure from sequence without using time-consuming experimental identification. {I}n this paper the performance of {S}upport {V}ector {M}achines ({SVM}s) is compared to {N}eural {N}etworks and to standard statistical classification methods as {D}iscriminant {A}nalysis and {N}earest {N}eighbor {C}lassification. {W}e show that {SVM}s can beat the competing methods on a dataset of 268 protein sequences to be classified into a set of 42 fold classes. {W}e discuss misclassification with respect to biological function and similarity. {I}n a second step we examine the performance of {SVM}s if the embedding is varied from frequencies of single amino acids to frequencies of tripletts of amino acids. {T}his work shows that {SVM} provide a promising alternative to standard statistical classification and prediction methods in functional genomics.}, doi = {10.1002/bimj.200390019}, pdf = {../local/Markowetz2003Support.pdf}, file = {Markowetz2003Support.pdf:local/Markowetz2003Support.pdf:PDF}, keywords = {biosvm}, owner = {jeanphilippevert}, url = {http://www3.interscience.wiley.com/cgi-bin/abstract/104525729/START} }
@article{Martin2005Predicting, author = {Martin, S. and Roe, D. and Faulon, J.-L.}, title = {Predicting protein-protein interactions using signature products}, journal = {Bioinformatics}, year = {2005}, volume = {21}, pages = {218--226}, number = {2}, month = jan, abstract = {Motivation: {P}roteome-wide prediction of protein-protein interaction is a difficult and important problem in biology. {A}lthough there have been recent advances in both experimental and computational methods for predicting protein-protein interactions, we are only beginning to see a confluence of these techniques. {I}n this paper, we describe a very general, high-throughput method for predicting protein-protein interactions. {O}ur method combines a sequence-based description of proteins with experimental information that can be gathered from any type of protein-protein interaction screen. {T}he method uses a novel description of interacting proteins by extending the signature descriptor, which has demonstrated success in predicting peptide/protein binding interactions for individual proteins. {T}his descriptor is extended to protein pairs by taking signature products. {T}he signature product is implemented within a support vector machine classifier as a kernel function. {R}esults: {W}e have applied our method to publicly available yeast, {H}elicobacter pylori, human and mouse datasets. {W}e used the yeast and {H}.pylori datasets to verify the predictive ability of our method, achieving from 70 to 80\% accuracy rates using 10-fold cross-validation. {W}e used the human and mouse datasets to demonstrate that our method is capable of cross-species prediction. {F}inally, we reused the yeast dataset to explore the ability of our algorithm to predict domains. 
{C}ontact: smartin@sandia.gov.}, doi = {10.1093/bioinformatics/bth483}, pdf = {../local/Martin2005Predicting.pdf}, file = {Martin2005Predicting.pdf:local/Martin2005Predicting.pdf:PDF}, keywords = {biosvm}, owner = {jeanphilippevert}, url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/21/2/218} }
@article{Matsuda2005novel, author = {Matsuda, A. and Vert, J.-P. and Saigo, H. and Ueda, N. and Toh, H. and Akutsu, T.}, title = {A novel representation of protein sequences for prediction of subcellular location using support vector machines}, journal = {Protein {S}ci.}, year = {2005}, volume = {14}, pages = {2804--2813}, number = {11}, abstract = {As the number of complete genomes rapidly increases, accurate methods to automatically predict the subcellular location of proteins are increasingly useful to help their functional annotation. {I}n order to improve the predictive accuracy of the many prediction methods developed to date, a novel representation of protein sequences is proposed. {T}his representation involves local compositions of amino acids and twin amino acids, and local frequencies of distance between successive (basic, hydrophobic, and other) amino acids. {F}or calculating the local features, each sequence is split into three parts: {N}-terminal, middle, and {C}-terminal. {T}he {N}-terminal part is further divided into four regions to consider ambiguity in the length and position of signal sequences. {W}e tested this representation with support vector machines on two data sets extracted from the {SWISS}-{PROT} database. {T}hrough fivefold cross-validation tests, overall accuracies of more than 87\% and 91\% were obtained for eukaryotic and prokaryotic proteins, respectively. {I}t is concluded that considering the respective features in the {N}-terminal, middle, and {C}-terminal parts is helpful to predict the subcellular location.}, doi = {10.1110/ps.051597405}, keywords = {biosvm}, url = {http://dx.doi.org/10.1110/ps.051597405} }
@article{Mattfeldt2003Classification, author = {Mattfeldt, T. and Gottfried, H. W. and Wolter, H. and Schmidt, V. and Kestler, H. A. and Mayer, J.}, title = {Classification of prostatic carcinoma with artificial neural networks using comparative genomic hybridization and quantitative stereological data}, journal = {Pathol. {R}es. {P}ract.}, year = {2003}, volume = {199}, pages = {773--784}, number = {12}, abstract = {Staging of prostate cancer is a mainstay of treatment decisions and prognostication. {I}n the present study, 50 p{T}2{N}0 and 28 p{T}3{N}0 prostatic adenocarcinomas were characterized by {G}leason grading, comparative genomic hybridization ({CGH}), and histological texture analysis based on principles of stereology and stochastic geometry. {T}he cases were classified by learning vector quantization and support vector machines. {T}he quality of classification was tested by cross-validation. {C}orrect prediction of stage from primary tumor data was possible with an accuracy of 74-80\% from different data sets. {T}he accuracy of prediction was similar when the {G}leason score was used as input variable, when stereological data were used, or when a combination of {CGH} data and stereological data was used. {T}he results of classification by learning vector quantization were slightly better than those by support vector machines. {A} method is briefly sketched by which training of neural networks can be adapted to unequal sample sizes per class. {P}rogression from p{T}2 to p{T}3 prostate cancer is correlated with complex changes of the epithelial cells in terms of volume fraction, of surface area, and of second-order stereological properties. 
{G}enetically, this progression is accompanied by a significant global increase in losses and gains of {DNA}, and specifically by increased numerical aberrations on chromosome arms 1q, 7p, and 8p.}, doi = {10.1078/0344-0338-00496}, keywords = {biosvm, cgh}, owner = {jeanphilippevert}, url = {http://dx.doi.org/10.1078/0344-0338-00496} }
@article{McKnight2003Categorization, author = {Larry McKnight and Padmini Srinivasan}, title = {Categorization of sentence types in medical abstracts.}, journal = {A{MIA} {A}nnu {S}ymp {P}roc}, year = {2003}, pages = {440--444}, abstract = {This study evaluated the use of machine learning techniques in the classification of sentence type. 7253 structured abstracts and 204 unstructured abstracts of {R}andomized {C}ontrolled {T}rials from {M}ed{LINE} were parsed into sentences and each sentence was labeled as one of four types ({I}ntroduction, {M}ethod, {R}esult, or {C}onclusion). {S}upport {V}ector {M}achine ({SVM}) and {L}inear {C}lassifier models were generated and evaluated on cross-validated data. {T}reating sentences as a simple ``bag of words'', the {SVM} model had an average {ROC} area of 0.92. {A}dding a feature of relative sentence location improved performance markedly for some models and overall increasing the average {ROC} to 0.95. {L}inear classifier performance was significantly worse than the {SVM} in all datasets. {U}sing the {SVM} model trained on structured abstracts to predict unstructured abstracts yielded performance similar to that of models trained with unstructured abstracts in 3 of the 4 types. {W}e conclude that classification of sentence type seems feasible within the domain of {RCT}'s. {I}dentification of sentence types may be helpful for providing context to end users or other text summarization techniques.}, keywords = {biosvm}, pii = {D030003164} }
@article{Meinicke2004Oligo, author = {Meinicke, P. and Tech, M. and Morgenstern, B. and Merkl, R.}, title = {Oligo kernels for datamining on biological sequences: a case study on prokaryotic translation initiation sites.}, journal = {B{MC} {B}ioinformatics}, year = {2004}, volume = {5}, number = {169}, abstract = {Background {K}ernel-based learning algorithms are among the most advanced machine learning methods and have been successfully applied to a variety of sequence classification tasks within the field of bioinformatics. {C}onventional kernels utilized so far do not provide an easy interpretation of the learnt representations in terms of positional and compositional variability of the underlying biological signals. {R}esults {W}e propose a kernel-based approach to datamining on biological sequences. {W}ith our method it is possible to model and analyze positional variability of oligomers of any length in a natural way. {O}n one hand this is achieved by mapping the sequences to an intuitive but high-dimensional feature space, well-suited for interpretation of the learnt models. {O}n the other hand, by means of the kernel trick we can provide a general learning algorithm for that high-dimensional representation because all required statistics can be computed without performing an explicit feature space mapping of the sequences. {B}y introducing a kernel parameter that controls the degree of position-dependency, our feature space representation can be tailored to the characteristics of the biological problem at hand. {A} regularized learning scheme enables application even to biological problems for which only small sets of example sequences are available. {O}ur approach includes a visualization method for transparent representation of characteristic sequence features. {T}hereby importance of features can be measured in terms of discriminative strength with respect to classification of the underlying sequences. 
{T}o demonstrate and validate our concept on a biochemically well-defined case, we analyze {E}. coli translation initiation sites in order to show that we can find biologically relevant signals. {F}or that case, our results clearly show that the {S}hine-{D}algarno sequence is the most important signal upstream a start codon. {T}he variability in position and composition we found for that signal is in accordance with previous biological knowledge. {W}e also find evidence for signals downstream of the start codon, previously introduced as transcriptional enhancers. {T}hese signals are mainly characterized by occurrences of adenine in a region of about 4 nucleotides next to the start codon. {C}onclusions {W}e showed that the oligo kernel can provide a valuable tool for the analysis of relevant signals in biological sequences. {I}n the case of translation initiation sites we could clearly deduce the most discriminative motifs and their positional variation from example sequences. {A}ttractive features of our approach are its flexibility with respect to oligomer length and position conservation. {B}y means of these two parameters oligo kernels can easily be adapted to different biological problems.}, doi = {10.1186/1471-2105-5-169}, pdf = {../local/Meinicke2004Oligo.pdf}, file = {Meinicke2004Oligo.pdf:local/Meinicke2004Oligo.pdf:PDF}, keywords = {biosvm}, owner = {jeanphilippevert}, url = {http://www.biomedcentral.com/1471-2105/5/169} }
@article{Meireles2003Differentially, author = {Meireles, S.I. and Carvalho, A.F. and Hirata, R. and Montagnini, A.L. and Martins, W.K. and Runza, F.B. and Stolf, B.S. and Termini, L. and Neto, C.E. and Silva, R.L. and Soares, F.A. and Neves, E.J. and Reis, L.F.}, title = {Differentially expressed genes in gastric tumors identified by c{DNA} array.}, journal = {Cancer {L}ett.}, year = {2003}, volume = {190}, pages = {199-211}, number = {2}, month = {Feb}, abstract = {Using c{DNA} fragments from the {FAPESP}/l{ICR} {C}ancer {G}enome {P}roject, we constructed a c{DNA} array having 4512 elements and determined gene expression in six normal and six tumor gastric tissues. {U}sing t-statistics, we identified 80 c{DNA}s whose expression in normal and tumor samples differed more than 3.5 sample standard deviations. {U}sing {S}elf-{O}rganizing {M}ap, the expression profile of these c{DNA}s allowed perfect separation of malignant and non-malignant samples. {U}sing the supervised learning procedure {S}upport {V}ector {M}achine, we identified trios of c{DNA}s that could be used to classify samples as normal or tumor, based on single-array analysis. {F}inally, we identified genes with altered linear correlation when their expression in normal and tumor samples were compared. {F}urther investigation concerning the function of these genes could contribute to the understanding of gastric carcinogenesis and may prove useful in molecular diagnostics.}, doi = {10.1016/S0304-3835(02)00587-6}, pdf = {../local/Meireles2003Differentially.pdf}, file = {Meireles2003Differentially.pdf:local/Meireles2003Differentially.pdf:PDF}, keywords = {biosvm microarray}, owner = {jeanphilippevert}, url = {http://dx.doi.org/10.1016/S0304-3835(02)00587-6} }
@article{Middendorf2004Discriminative, author = {Middendorf, M. and Ziv, E. and Adams, C. and Hom, J. and Koytcheff, R. and Levovitz, C. and Woods, G. and Chen, L. and Wiggins, C.}, title = {Discriminative topological features reveal biological network mechanisms.}, journal = {B{MC} {B}ioinformatics}, year = {2004}, volume = {5}, number = {181}, abstract = {B{ACKGROUND}: {R}ecent genomic and bioinformatic advances have motivated the development of numerous network models intending to describe graphs of biological, technological, and sociological origin. {I}n most cases the success of a model has been evaluated by how well it reproduces a few key features of the real-world data, such as degree distributions, mean geodesic lengths, and clustering coefficients. {O}ften pairs of models can reproduce these features with indistinguishable fidelity despite being generated by vastly different mechanisms. {I}n such cases, these few target features are insufficient to distinguish which of the different models best describes real world networks of interest; moreover, it is not clear a priori that any of the presently-existing algorithms for network generation offers a predictive description of the networks inspiring them. {RESULTS}: {W}e present a method to assess systematically which of a set of proposed network generation algorithms gives the most accurate description of a given biological network. {T}o derive discriminative classifiers, we construct a mapping from the set of all graphs to a high-dimensional (in principle infinite-dimensional) "word space". {T}his map defines an input space for classification schemes which allow us to state unambiguously which models are most descriptive of a given network of interest. {O}ur training sets include networks generated from 17 models either drawn from the literature or introduced in this work. {W}e show that different duplication-mutation schemes best describe the {E}. coli genetic network, the {S}. 
cerevisiae protein interaction network, and the {C}. elegans neuronal network, out of a set of network models including a linear preferential attachment model and a small-world model. {CONCLUSIONS}: {O}ur method is a first step towards systematizing network models and assessing their predictability, and we anticipate its usefulness for a number of communities.}, doi = {10.1186/1471-2105-5-181}, pdf = {../local/Middendorf2004Discriminative.pdf}, file = {Middendorf2004Discriminative.pdf:local/Middendorf2004Discriminative.pdf:PDF}, keywords = {biosvm}, owner = {jeanphilippevert}, url = {http://www.biomedcentral.com/1471-2105/5/181} }
@article{Mika2004NLProt, author = {Sven Mika and Burkhard Rost}, title = {N{LP}rot: extracting protein names and sequences from papers.}, journal = {Nucleic {A}cids {R}es}, year = {2004}, volume = {32}, pages = {W634-7}, number = {Web Server issue}, month = {Jul}, abstract = {Automatically extracting protein names from the literature and linking these names to the associated entries in sequence databases is becoming increasingly important for annotating biological databases. {NLP}rot is a novel system that combines dictionary- and rule-based filtering with several support vector machines ({SVM}s) to tag protein names in {P}ub{M}ed abstracts. {W}hen considering partially tagged names as errors, {NLP}rot still reached a precision of 75\% at a recall of 76\%. {B}y many criteria our system outperformed other tagging methods significantly; in particular, it proved very reliable even for novel names. {N}ames encountered particularly frequently in {D}rosophila, such as white, wing and bizarre, constitute an obvious limitation of {NLP}rot. {O}ur method is available both as an {I}nternet server and as a program for download (http://cubic.bioc.columbia.edu/services/{NLP}rot/). {I}nput can be {P}ub{M}ed/{MEDLINE} identifiers, authors, titles and journals, as well as collections of abstracts, or entire papers.}, doi = {10.1093/nar/gkh427}, keywords = {biosvm nlp}, pii = {32/suppl_2/W634}, url = {http://dx.doi.org/10.1093/nar/gkh427} }
@article{Mika2004Protein, author = {Mika, Sven and Rost, Burkhard}, title = {Protein names precisely peeled off free text}, journal = {Bioinformatics}, year = {2004}, volume = {20}, pages = {i241-i247}, number = {Suppl. 1}, abstract = {Motivation: {A}utomatically identifying protein names from the scientific literature is a pre-requisite for the increasing demand in data-mining this wealth of information. {E}xisting approaches are based on dictionaries, rules and machine-learning. {H}ere, we introduced a novel system that combines a pre-processing dictionary- and rule-based filtering step with several separately trained support vector machines ({SVM}s) to identify protein names in the {MEDLINE} abstracts. {R}esults: {O}ur new tagging-system {NLP}rot is capable of extracting protein names with a precision (accuracy) of 75\% at a recall (coverage) of 76\% after training on a corpus, which was used before by other groups and contains 200 annotated abstracts. {F}or our estimate of sustained performance, we considered partially identified names as false positives. {O}ne important issue frequently ignored in the literature is the redundancy in evaluation sets. {W}e suggested some guidelines for removing overly inadequate overlaps between training and testing sets. {A}pplying these new guidelines, our program appeared to significantly out-perform other methods tagging protein names. {NLP}rot was so successful due to the {SVM}-building blocks that succeeded in utilizing the local context of protein names in the scientific literature. {W}e challenge that our system may constitute the most general and precise method for tagging protein names. {A}vailability: http://cubic.bioc.columbia.edu/services/nlprot/}, pdf = {../local/Mika2004Protein.pdf}, file = {Mika2004Protein.pdf:local/Mika2004Protein.pdf:PDF}, keywords = {biosvm nlp}, owner = {jeanphilippevert}, url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/20/suppl_1/i241} }
@article{Mitsumori2005Gene, author = {Tomohiro Mitsumori and Sevrani Fation and Masaki Murata and Kouichi Doi and Hirohumi Doi}, title = {Gene/protein name recognition based on support vector machine using dictionary as features.}, journal = {B{MC} {B}ioinformatics}, year = {2005}, volume = {6 Suppl 1}, pages = {S8}, abstract = {B{ACKGROUND}: {A}utomated information extraction from biomedical literature is important because a vast amount of biomedical literature has been published. {R}ecognition of the biomedical named entities is the first step in information extraction. {W}e developed an automated recognition system based on the {SVM} algorithm and evaluated it in {T}ask 1.{A} of {B}io{C}re{A}t{I}v{E}, a competition for automated gene/protein name recognition. {RESULTS}: {I}n the work presented here, our recognition system uses the feature set of the word, the part-of-speech ({POS}), the orthography, the prefix, the suffix, and the preceding class. {W}e call these features "internal resource features", i.e., features that can be found in the training data. {A}dditionally, we consider the features of matching against dictionaries to be external resource features. {W}e investigated and evaluated the effect of these features as well as the effect of tuning the parameters of the {SVM} algorithm. {W}e found that the dictionary matching features contributed slightly to the improvement in the performance of the f-score. {W}e attribute this to the possibility that the dictionary matching features might overlap with other features in the current multiple feature setting. {CONCLUSION}: {D}uring {SVM} learning, each feature alone had a marginally positive effect on system performance. 
{T}his supports the fact that the {SVM} algorithm is robust on the high dimensionality of the feature vector space and means that feature selection is not required.}, doi = {10.1186/1471-2105-6-S1-S8}, pdf = {../local/Mitsumori2005Gene.pdf}, file = {Mitsumori2005Gene.pdf:local/Mitsumori2005Gene.pdf:PDF}, keywords = {biosvm nlp}, pii = {1471-2105-6-S1-S8}, url = {http://dx.doi.org/10.1186/1471-2105-6-S1-S8} }
@article{Model2001Feature, author = {Model, F. and Adorjan, P. and Olek, A. and Piepenbrock, C.}, title = {Feature selection for {DNA} methylation based cancer classification}, journal = {Bioinformatics}, year = {2001}, volume = {17}, pages = {S157-S164}, number = {Supp. 1}, abstract = {Molecular portraits, such as m{RNA} expression or {DNA} methylation patterns, have been shown to be strongly correlated with phenotypical parameters. {T}hese molecular patterns can be revealed routinely on a genomic scale. {H}owever, class prediction based on these patterns is an under-determined problem, due to the extreme high dimensionality of the data compared to the usually small number of available samples. {T}his makes a reduction of the data dimensionality necessary. {H}ere we demonstrate how phenotypic classes can be predicted by combining feature selection and discriminant analysis. {B}y comparing several feature selection methods we show that the right dimension reduction strategy is of crucial importance for the classification performance. {T}he techniques are demonstrated by methylation pattern based discrimination between acute lymphoblastic leukemia and acute myeloid leukemia. {C}ontact: {F}abian.{M}odel@epigenomics.com}, pdf = {../local/Model2001Feature.pdf}, file = {Model2001Feature.pdf:local/Model2001Feature.pdf:PDF}, keywords = {biosvm}, owner = {jeanphilippevert}, url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/17/suppl_1/S157} }
@article{Moler2000Analysis, author = {Moler, E. J. and Chow, M. L. and Mian, I. S.}, title = {Analysis of molecular profile data using generative and discriminative methods}, journal = {Physiol. {G}enomics}, year = {2000}, volume = {4}, pages = {109-126}, number = {2}, month = {Dec}, abstract = {A modular framework is proposed for modeling and understanding the relationships between molecular profile data and other domain knowledge using a combination of generative (here, graphical models) and discriminative [{S}upport {V}ector {M}achines ({SVM}s)] methods. {A}s illustration, naive {B}ayes models, simple graphical models, and {SVM}s were applied to published transcription profile data for 1,988 genes in 62 colon adenocarcinoma tissue specimens labeled as tumor or nontumor. {T}hese unsupervised and supervised learning methods identified three classes or subtypes of specimens, assigned tumor or nontumor labels to new specimens and detected six potentially mislabeled specimens. {T}he probability parameters of the three classes were utilized to develop a novel gene relevance, ranking, and selection method. {SVM}s trained to discriminate nontumor from tumor specimens using only the 50-200 top-ranked genes had the same or better generalization performance than the full repertoire of 1,988 genes. {A}pproximately 90 marker genes were pinpointed for use in understanding the basic biology of colon adenocarcinoma, defining targets for therapeutic intervention and developing diagnostic tools. {T}hese potential markers highlight the importance of tissue biology in the etiology of cancer. {C}omparative analysis of molecular profile data is proposed as a mechanism for predicting the physiological function of genes in instances when comparative sequence analysis proves uninformative, such as with human and yeast translationally controlled tumour protein. 
{G}raphical models and {SVM}s hold promise as the foundations for developing decision support systems for diagnosis, prognosis, and monitoring as well as inferring biological networks.}, pdf = {../local/Moler2000Analysis.pdf}, file = {Moler2000Analysis.pdf:local/Moler2000Analysis.pdf:PDF}, keywords = {biosvm}, owner = {jeanphilippevert}, url = {http://physiolgenomics.physiology.org/cgi/content/abstract/4/2/109} }
@techreport{Mukherjee1998Support, author = {S. Mukherjee and P. Tamayo and J. P. Mesirov and D. Slonim and A. Verri and T. Poggio}, title = {Support vector machine classification of microarray data}, institution = {C.B.C.L.}, year = {1998}, number = {182}, note = {A.I. Memo 1677}, pdf = {../local/Mukherjee1998Support.pdf}, file = {Mukherjee1998Support.pdf:local/Mukherjee1998Support.pdf:PDF}, keywords = {biosvm microarray}, subject = {biokernel}, url = {http://citeseer.nj.nec.com/437379.html} }
@article{Myasnikova2002Support, author = {Myasnikova, E. and Samsonova, A. and Samsonova, M. and Reinitz, J.}, title = {Support vector regression applied to the determination of the developmental age of a {D}rosophila embryo from its segmentation gene expression patterns}, journal = {Bioinformatics}, year = {2002}, volume = {18}, pages = {S87-S95}, number = {Suppl. 1}, abstract = {Motivation: {I}n this paper we address the problem of the determination of developmental age of an embryo from its segmentation gene expression patterns in {D}rosophila. {R}esults: {B}y applying support vector regression we have developed a fast method for automated staging of an embryo on the basis of its gene expression pattern. {S}upport vector regression is a statistical method for creating regression functions of arbitrary type from a set of training data. {T}he training set is composed of embryos for which the precise developmental age was determined by measuring the degree of membrane invagination. {T}esting the quality of regression on the training set showed good prediction accuracy. {T}he optimal regression function was then used for the prediction of the gene expression based age of embryos in which the precise age has not been measured by membrane morphology. {M}oreover, we show that the same accuracy of prediction can be achieved when the dimensionality of the feature vector was reduced by applying factor analysis. {T}he data reduction allowed us to avoid over-fitting and to increase the efficiency of the algorithm. {A}vailability: {T}his software may be obtained from the authors. 
{C}ontact: samson@fn.csa.ru {K}eywords: gene expression patterns; development; embryo staging; support vector regression; segmentation genes; {D}rosophila.}, pdf = {../local/Myasnikova2002Support.pdf}, file = {Myasnikova2002Support.pdf:local/Myasnikova2002Support.pdf:PDF}, keywords = {biosvm}, owner = {jeanphilippevert}, url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/18/suppl_1/S87} }
@article{Mueller2005Classifying, author = {K.-R. M{\"u}ller and G. R{\"a}tsch and S. Sonnenburg and S. Mika and M. Grimm and N. Heinrich}, title = {Classifying 'drug-likeness' with {K}ernel-based learning methods.}, journal = {J {C}hem {I}nf {M}odel}, year = {2005}, volume = {45}, pages = {249-53}, number = {2}, abstract = {In this article we report about a successful application of modern machine learning technology, namely {S}upport {V}ector {M}achines, to the problem of assessing the 'drug-likeness' of a chemical from a given set of descriptors of the substance. {W}e were able to drastically improve the recent result by {B}yvatov et al. (2003) on this task and achieved an error rate of about 7\% on unseen compounds using {S}upport {V}ector {M}achines. {W}e see a very high potential of such machine learning techniques for a variety of computational chemistry problems that occur in the drug discovery and drug design process.}, doi = {10.1021/ci049737o}, pdf = {../local/Mueller2005Classifying.pdf}, file = {Mueller2005Classifying.pdf:local/Mueller2005Classifying.pdf:PDF}, keywords = {biosvm chemoinformatics}, url = {http://dx.doi.org/10.1021/ci049737o} }
@article{Nair2005Mimicking, author = {Rajesh Nair and Burkhard Rost}, title = {Mimicking cellular sorting improves prediction of subcellular localization.}, journal = {J {M}ol {B}iol}, year = {2005}, volume = {348}, pages = {85-100}, number = {1}, month = {Apr}, abstract = {Predicting the native subcellular compartment of a protein is an important step toward elucidating its function. {H}ere we introduce {LOC}tree, a hierarchical system combining support vector machines ({SVM}s) and other prediction methods. {LOC}tree predicts the subcellular compartment of a protein by mimicking the mechanism of cellular sorting and exploiting a variety of sequence and predicted structural features in its input. {C}urrently {LOC}tree does not predict localization for membrane proteins, since the compositional properties of membrane proteins significantly differ from those of non-membrane proteins. {W}hile any information about function can be used by the system, we present estimates of performance that are valid when only the amino acid sequence of a protein is known. {W}hen evaluated on a non-redundant test set, {LOC}tree achieved sustained levels of 74\% accuracy for non-plant eukaryotes, 70\% for plants, and 84\% for prokaryotes. {W}e rigorously benchmarked {LOC}tree in comparison to the best alternative methods for localization prediction. {LOC}tree outperformed all other methods in nearly all benchmarks. {L}ocalization assignments using {LOC}tree agreed quite well with data from recent large-scale experiments. 
{O}ur preliminary analysis of a few entirely sequenced organisms, namely human ({H}omo sapiens), yeast ({S}accharomyces cerevisiae), and weed ({A}rabidopsis thaliana) suggested that over 35\% of all non-membrane proteins are nuclear, about 20\% are retained in the cytosol, and that every fifth protein in the weed resides in the chloroplast.}, doi = {10.1016/j.jmb.2005.02.025}, pdf = {../local/Nair2005Mimicking.pdf}, file = {Nair2005Mimicking.pdf:local/Nair2005Mimicking.pdf:PDF}, keywords = {biosvm}, pii = {S0022-2836(05)00177-4}, url = {http://dx.doi.org/10.1016/j.jmb.2005.02.025} }
@article{Natt2004Prediction, author = {Natt, N.K. and Kaur, H. and Raghava, G.P.}, title = {Prediction of transmembrane regions of beta-barrel proteins using {ANN}- and {SVM}-based methods.}, journal = {Proteins}, year = {2004}, volume = {56}, pages = {11-18}, number = {1}, abstract = {This article describes a method developed for predicting transmembrane beta-barrel regions in membrane proteins using machine learning techniques: artificial neural network ({ANN}) and support vector machine ({SVM}). {T}he {ANN} used in this study is a feed-forward neural network with a standard back-propagation training algorithm. {T}he accuracy of the {ANN}-based method improved significantly, from 70.4\% to 80.5\%, when evolutionary information was added to a single sequence as a multiple sequence alignment obtained from {PSI}-{BLAST}. {W}e have also developed an {SVM}-based method using a primary sequence as input and achieved an accuracy of 77.4\%. {T}he {SVM} model was modified by adding 36 physicochemical parameters to the amino acid sequence information. {F}inally, {ANN}- and {SVM}-based methods were combined to utilize the full potential of both techniques. {T}he accuracy and {M}atthews correlation coefficient ({MCC}) value of {SVM}, {ANN}, and combined method are 78.5\%, 80.5\%, and 81.8\%, and 0.55, 0.63, and 0.64, respectively. {T}hese methods were trained and tested on a nonredundant data set of 16 proteins, and performance was evaluated using "leave one out cross-validation" ({LOOCV}). {B}ased on this study, we have developed a {W}eb server, {TBBP}red, for predicting transmembrane beta-barrel regions in proteins (available at http://www.imtech.res.in/raghava/tbbpred).}, doi = {10.1002/prot.20092}, pdf = {../local/Natt2004Prediction.pdf}, file = {Natt2004Prediction.pdf:local/Natt2004Prediction.pdf:PDF}, keywords = {biosvm}, owner = {jeanphilippevert}, url = {http://dx.doi.org/10.1002/prot.20092} }
@article{Nguyen2005Prediction, author = {Minh N Nguyen and Jagath C Rajapakse}, title = {Prediction of protein relative solvent accessibility with a two-stage {SVM} approach.}, journal = {Proteins}, year = {2005}, volume = {59}, pages = {30-7}, number = {1}, month = {Apr}, abstract = {Information on relative solvent accessibility ({RSA}) of amino acid residues in proteins provides valuable clues to the prediction of protein structure and function. {A} two-stage approach with support vector machines ({SVM}s) is proposed, where an {SVM} predictor is introduced to the output of the single-stage {SVM} approach to take into account the contextual relationships among solvent accessibilities for the prediction. {B}y using the position-specific scoring matrices ({PSSM}s) generated by {PSI}-{BLAST}, the two-stage {SVM} approach achieves accuracies up to 90.4\% and 90.2\% on the {M}anesh data set of 215 protein structures and the {RS}126 data set of 126 nonhomologous globular proteins, respectively, which are better than the highest published scores on both data sets to date. {A} {W}eb server for protein {RSA} prediction using a two-stage {SVM} method has been developed and is available (http://birc.ntu.edu.sg/~pas0186457/rsa.html).}, doi = {10.1002/prot.20404}, pdf = {../local/Nguyen2005Prediction.pdf}, file = {Nguyen2005Prediction.pdf:local/Nguyen2005Prediction.pdf:PDF}, keywords = {biosvm}, url = {http://dx.doi.org/10.1002/prot.20404} }
@article{Nguyen2005Two-stage, author = {M. N. Nguyen and J. C. Rajapakse}, title = {Two-stage multi-class support vector machines to protein secondary structure prediction.}, journal = {Pac {S}ymp {B}iocomput}, year = {2005}, pages = {346-57}, abstract = {Bioinformatics techniques to protein secondary structure ({PSS}) prediction are mostly single-stage approaches in the sense that they predict secondary structures of proteins by taking into account only the contextual information in amino acid sequences. {I}n this paper, we propose two-stage {M}ulti-class {S}upport {V}ector {M}achine ({MSVM}) approach where a {MSVM} predictor is introduced to the output of the first stage {MSVM} to capture the sequential relationship among secondary structure elements for the prediction. {B}y using position specific scoring matrices, generated by {PSI}-{BLAST}, the two-stage {MSVM} approach achieves {Q}3 accuracies of 78.0\% and 76.3\% on the {RS}126 dataset of 126 nonhomologous globular proteins and the {CB}396 dataset of 396 nonhomologous proteins, respectively, which are better than the highest scores published on both datasets to date.}, keywords = {biosvm} }
@article{Nguyen2003Multi-class, author = {Minh N Nguyen and Jagath C Rajapakse}, title = {Multi-class support vector machines for protein secondary structure prediction.}, journal = {Genome {I}nform {S}er {W}orkshop {G}enome {I}nform}, year = {2003}, volume = {14}, pages = {218-27}, abstract = {The solution of binary classification problems using the {S}upport {V}ector {M}achine ({SVM}) method has been well developed. {T}hough multi-class classification is typically solved by combining several binary classifiers, recently, several multi-class methods that consider all classes at once have been proposed. {H}owever, these methods require resolving a much larger optimization problem and are applicable to small datasets. {T}hree methods based on binary classifications: one-against-all ({OAA}), one-against-one ({OAO}), and directed acyclic graph ({DAG}), and two approaches for multi-class problem by solving one single optimization problem, are implemented to predict protein secondary structure. {O}ur experiments indicate that multi-class {SVM} methods are more suitable for protein secondary structure ({PSS}) prediction than the other methods, including binary {SVM}s, because their capacity to solve an optimization problem in one step. {F}urthermore, in this paper, we argue that it is feasible to extend the prediction accuracy by adding a second-stage multi-class {SVM} to capture the contextual information among secondary structural elements and thereby further improving the accuracies. {W}e demonstrate that two-stage {SVM}s perform better than single-stage {SVM} techniques for {PSS} prediction using two datasets and report a maximum accuracy of 79.5\%.}, keywords = {biosvm} }
@incollection{Noble2004Support, author = {Noble, W. S.}, title = {Support vector machine applications in computational biology}, booktitle = {Kernel {M}ethods in {C}omputational {B}iology}, publisher = {MIT Press}, year = {2004}, editor = {Sch{\"o}lkopf, B. and Tsuda, K. and Vert, J.P.}, pages = {71-92}, abstract = {During the past three years, the support vector machine learning algorithm has been extensively applied within the field of computational biology. {T}he algorithm has been used to detect patterns within and among biological sequences, to classify genes and patients based upon gene expression profiles, and has recently been applied to several new biological problems. {T}his chapter reviews the state of the art with respect to {SVM} applications in computational biology.}, pdf = {../local/Noble2004Support.pdf}, file = {Noble2004Support.pdf:local/Noble2004Support.pdf:PDF}, keywords = {biosvm}, owner = {vert} }
@article{ODonnell2005Gene, author = {Rebekah K O'Donnell and Michael Kupferman and S. Jack Wei and Sunil Singhal and Randal Weber and Bert O'Malley and Yi Cheng and Mary Putt and Michael Feldman and Barry Ziober and Ruth J Muschel}, title = {Gene expression signature predicts lymphatic metastasis in squamous cell carcinoma of the oral cavity.}, journal = {Oncogene}, year = {2005}, volume = {24}, pages = {1244-51}, number = {7}, month = {Feb}, abstract = {Metastasis via the lymphatics is a major risk factor in squamous cell carcinoma of the oral cavity ({OSCC}). {W}e sought to determine whether the presence of metastasis in the regional lymph node could be predicted by a gene expression signature of the primary tumor. {A} total of 18 {OSCC}s were characterized for gene expression by hybridizing {RNA} to {A}ffymetrix {U}133{A} gene chips. {G}enes with differential expression were identified using a permutation technique and verified by quantitative {RT}-{PCR} and immunohistochemistry. {A} predictive rule was built using a support vector machine, and the accuracy of the rule was evaluated using crossvalidation on the original data set and prediction of an independent set of four patients. {M}etastatic primary tumors could be differentiated from nonmetastatic primary tumors by a signature gene set of 116 genes. {T}his signature gene set correctly predicted the four independent patients as well as associating five lymph node metastases from the original patient set with the metastatic primary tumor group. {W}e concluded that lymph node metastasis could be predicted by gene expression profiles of primary oral cavity squamous cell carcinomas. 
{T}he presence of a gene expression signature for lymph node metastasis indicates that clinical testing to assess risk for lymph node metastasis should be possible.}, doi = {10.1038/sj.onc.1208285}, pdf = {../local/O'Donnell2005Gene.pdf}, file = {O'Donnell2005Gene.pdf:local/O'Donnell2005Gene.pdf:PDF}, keywords = {biosvm microarray}, pii = {1208285}, url = {http://dx.doi.org/10.1038/sj.onc.1208285} }
@comment{O'Flanagan et al. (2005), Bioinformatics: non-additive effects in protein-DNA binding, used to improve binding site prediction with weight matrices or SVMs.}
@article{OFlanagan2005Non,
  author   = {R. A. O'Flanagan and G. Paillard and R. Lavery and A. M. Sengupta},
  title    = {Non-additivity in protein-{DNA} binding.},
  journal  = {Bioinformatics},
  year     = {2005},
  volume   = {21},
  pages    = {2254--2263},
  number   = {10},
  month    = may,
  abstract = {M{OTIVATION}: {L}ocalizing protein binding sites within genomic {DNA} is of considerable importance, but remains difficult for protein families, such as transcription factors, which have loosely defined target sequences. {I}t is generally assumed that protein affinity for {DNA} involves additive contributions from successive nucleotide pairs within the target sequence. {T}his is not necessarily true, and non-additive effects have already been experimentally demonstrated in a small number of cases. {T}he principal origin of non-additivity involves the so-called indirect component of protein-{DNA} recognition which is related to the sequence dependence of {DNA} deformation induced during complex formation. {N}on-additive effects are difficult to study because they require the identification of many more binding sequences than are normally necessary for describing additive specificity (typically via the construction of weight matrices). {RESULTS}: {I}n the present work we will use theoretically estimated binding energies as a basis for overcoming this problem. {O}ur approach enables us to study the full combinatorial set of sequences for a variety of {DNA}-binding proteins, make a detailed analysis of non-additive effects and exploit this information to improve binding site predictions using either weight matrices or support vector machines. {T}he results underline the fact that, even in the presence of significant deformation, non-additive effects may involve only a limited number of dinucleotide steps. {T}his information helps to reduce the number of binding sites which need to be identified for successful predictions and to avoid problems of over-fitting.
             {AVAILABILITY}: {T}he {SVM} software is available upon request from the authors.},
  doi      = {10.1093/bioinformatics/bti361},
  pdf      = {../local/OFlanagan2005Non.pdf},
  file     = {OFlanagan2005Non.pdf:local/OFlanagan2005Non.pdf:PDF},
  keywords = {biosvm},
  pii      = {bti361},
  url      = {http://dx.doi.org/10.1093/bioinformatics/bti361}
}
@comment{Pahikkala et al. (2005), BMC Bioinformatics: distance-based contextual weighting for SVMs applied to gene versus protein name disambiguation.}
@article{Pahikkala2005Contextual,
  author   = {Tapio Pahikkala and Filip Ginter and Jorma Boberg and Jouni Jarvinen and Tapio Salakoski},
  title    = {Contextual weighting for {S}upport {V}ector {M}achines in literature mining: an application to gene versus protein name disambiguation.},
  journal  = {B{MC} {B}ioinformatics},
  year     = {2005},
  volume   = {6},
  pages    = {157},
  number   = {1},
  month    = jun,
  abstract = {B{ACKGROUND}: {T}he ability to distinguish between genes and proteins is essential for understanding biological text. {S}upport {V}ector {M}achines ({SVM}s) have been proven to be very efficient in general data mining tasks. {W}e explore their capability for the gene versus protein name disambiguation task. {RESULTS}: {W}e incorporated into the conventional {SVM} a weighting scheme based on distances of context words from the word to be disambiguated. {T}his weighting scheme increased the performance of {SVM}s by five percentage points giving performance better than 85\% as measured by the area under {ROC} curve and outperformed the {W}eighted {A}dditive {C}lassifier, which also incorporates the weighting, and the {N}aive {B}ayes classifier. {CONCLUSIONS}: {W}e show that the performance of {SVM}s can be improved by the proposed weighting scheme. {F}urthermore, our results suggest that in this study the increase of the classification performance due to the weighting is greater than that obtained by selecting the underlying classifier or the kernel part of the {SVM}.},
  doi      = {10.1186/1471-2105-6-157},
  pdf      = {../local/Pahikkala2005Contextual.pdf},
  file     = {Pahikkala2005Contextual.pdf:local/Pahikkala2005Contextual.pdf:PDF},
  keywords = {biosvm},
  pii      = {1471-2105-6-157},
  url      = {http://dx.doi.org/10.1186/1471-2105-6-157}
}
@comment{Pan et al. (2004), J Biomed Inform: vertical sample-based KNN/LSVM classification for gene expression data. The publisher footnote that was fused into the abstract is kept verbatim but parenthesized, and its hash sign is escaped.}
@article{Pan2004Comprehensive,
  author   = {Fei Pan and Baoying Wang and Xin Hu and William Perrizo},
  title    = {Comprehensive vertical sample-based {KNN}/{LSVM} classification for gene expression analysis.},
  journal  = {J {B}iomed {I}nform},
  year     = {2004},
  volume   = {37},
  pages    = {240--248},
  number   = {4},
  month    = aug,
  abstract = {Classification analysis of microarray gene expression data has been widely used to uncover biological features and to distinguish closely related cell types that often appear in the diagnosis of cancer. {H}owever, the number of dimensions of gene expression data is often very high, e.g., in the hundreds or thousands. {A}ccurate and efficient classification of such high-dimensional data remains a contemporary challenge. {I}n this paper, we propose a comprehensive vertical sample-based {KNN}/{LSVM} classification approach with weights optimized by genetic algorithms for high-dimensional data. {E}xperiments on common gene expression datasets demonstrated that our approach can achieve high accuracy and efficiency at the same time. {T}he improvement of speed is mainly related to the vertical data representation, {P}-tree ({P}atents are pending on the {P}-tree technology. {T}his work is partially supported by {GSA} {G}rant {ACT}\#:{K}96130308.), and its optimized logical algebra. {T}he high accuracy is due to the combination of a {KNN} majority voting approach and a local support vector machine approach that makes optimal decisions at the local level. {A}s a result, our approach could be a powerful tool for high-dimensional gene expression data analysis.},
  doi      = {10.1016/j.jbi.2004.07.003},
  pdf      = {../local/Pan2004Comprehensive.pdf},
  file     = {Pan2004Comprehensive.pdf:local/Pan2004Comprehensive.pdf:PDF},
  keywords = {biosvm},
  pii      = {S1532-0464(04)00070-X},
  url      = {http://dx.doi.org/10.1016/j.jbi.2004.07.003}
}
@comment{Park and Kanehisa (2003), Bioinformatics: SVM prediction of protein subcellular location from amino acid and amino acid pair compositions.}
@article{Park2003Prediction,
  author   = {Park, K.-J. and Kanehisa, M.},
  title    = {Prediction of protein subcellular locations by support vector machines using compositions of amino acids and amino acid pairs},
  journal  = {Bioinformatics},
  year     = {2003},
  volume   = {19},
  pages    = {1656--1663},
  number   = {13},
  abstract = {Motivation: {T}he subcellular location of a protein is closely correlated to its function. {T}hus, computational prediction of subcellular locations from the amino acid sequence information would help annotation and functional prediction of protein coding genes in complete genomes. {W}e have developed a method based on support vector machines ({SVM}s). {R}esults: {W}e considered 12 subcellular locations in eukaryotic cells: chloroplast, cytoplasm, cytoskeleton, endoplasmic reticulum, extracellular medium, {G}olgi apparatus, lysosome, mitochondrion, nucleus, peroxisome, plasma membrane, and vacuole. {W}e constructed a data set of proteins with known locations from the {SWISS}-{PROT} database. {A} set of {SVM}s was trained to predict the subcellular location of a given protein based on its amino acid, amino acid pair, and gapped amino acid pair compositions. {T}he predictors based on these different compositions were then combined using a voting scheme. {R}esults obtained through 5-fold cross-validation tests showed an improvement in prediction accuracy over the algorithm based on the amino acid composition only. {T}his prediction method is available via the {I}nternet. {A}vailability: http://www.genome.ad.jp/{SIT}/ploc.html {S}upplementary information: http://web.kuicr.kyoto-u.ac.jp/~park/{S}eqdata/},
  pdf      = {../local/Park2003Prediction.pdf},
  file     = {Park2003Prediction.pdf:local/Park2003Prediction.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/13/1656}
}
@comment{Passerini and Frasconi (2004), Protein Eng. Des. Sel.: SVM discrimination between ligand-bound and disulfide-bound cysteines.}
@article{Passerini2004Learning,
  author   = {Passerini, A. and Frasconi, P.},
  title    = {Learning to discriminate between ligand-bound and disulfide-bound cysteines},
  journal  = {Protein {E}ng. {D}es. {S}el.},
  year     = {2004},
  volume   = {17},
  pages    = {367--373},
  number   = {4},
  abstract = {We present a machine learning method to discriminate between cysteines involved in ligand binding and cysteines forming disulfide bridges. {O}ur method uses a window of multiple alignment profiles to represent each instance and support vector machines with a polynomial kernel as the learning algorithm. {W}e also report results obtained with two new kernel functions based on similarity matrices. {E}xperimental results indicate that binding type can be predicted at significantly higher accuracy than using {PROSITE} patterns.},
  doi      = {10.1093/protein/gzh042},
  pdf      = {../local/Passerini2004Learning.pdf},
  file     = {Passerini2004Learning.pdf:local/Passerini2004Learning.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://dx.doi.org/10.1093/protein/gzh042}
}
@comment{Patterson, Yasuhara and Ruzzo (2002), PSB: predicted pre-mRNA secondary structure as a feature for acceptor splice site prediction. The percent sign in the abstract is escaped so the text survives LaTeX rendering.}
@inproceedings{Patterson2002Pre-mRNA,
  author    = {Patterson, D. J. and Yasuhara, K. and Ruzzo, W. L.},
  title     = {Pre-{m{RNA}} secondary structure prediction aids splice site prediction.},
  booktitle = {Proceedings of the {P}acific {S}ymposium on {B}iocomputing 2002},
  year      = {2002},
  editor    = {Russ B. Altman and A. Keith Dunker and Lawrence Hunter and Kevin Lauerdale and Teri E. Klein},
  pages     = {223--234},
  publisher = {World Scientific},
  abstract  = {Accurate splice site prediction is a critical component of any computational approach to gene prediction in higher organisms. {E}xisting approaches generally use sequence-based models that capture local dependencies among nucleotides in a small window around the splice site. {W}e present evidence that computationally predicted secondary structure of moderate length pre-m{RNA} subsequencies contains information that can be exploited to improve acceptor splice site prediction beyond that possible with conventional sequence-based approaches. {B}oth decision tree and support vector machine classifiers, using folding energy and structure metrics characterizing helix formation near the splice site, achieve a 5-10\% reduction in error rate with a human data set. {B}ased on our data, we hypothesize that acceptors preferentially exhibit short helices at the splice site.},
  pdf       = {../local/Patterson2002Pre-mRNA.pdf},
  file      = {Patterson2002Pre-mRNA.pdf:local/Patterson2002Pre-mRNA.pdf:PDF},
  keywords  = {biosvm},
  subject   = {biokernel},
  url       = {http://www.smi.stanford.edu/projects/helix/psb02/patterson.pdf}
}
@comment{Pavey et al. (2004), Oncogene: microarray expression profiling and SVM classification of BRAF mutation status in melanoma cell lines. Initials are space-separated so BibTeX keeps every initial when abbreviating.}
@article{Pavey2004Microarray,
  author   = {Pavey, S. and Johansson, P. and Packer, L. and Taylor, J. and Stark, M. and Pollock, P. M. and Walker, G. J. and Boyle, G. M. and Harper, U. and Cozzi, S. J. and Hansen, K. and Yudt, L. and Schmidt, C. and Hersey, P. and Ellem, K. A. and O'Rourke, M. G. and Parsons, P. G. and Meltzer, P. and Ringner, M. and Hayward, N. K.},
  title    = {Microarray expression profiling in melanoma reveals a {BRAF} mutation signature},
  journal  = {Oncogene},
  year     = {2004},
  volume   = {23},
  pages    = {4060--4067},
  number   = {23},
  month    = may,
  abstract = {We have used microarray gene expression profiling and machine learning to predict the presence of {BRAF} mutations in a panel of 61 melanoma cell lines. {T}he {BRAF} gene was found to be mutated in 42 samples (69\%) and intragenic mutations of the {NRAS} gene were detected in seven samples (11\%). {N}o cell line carried mutations of both genes. {U}sing support vector machines, we have built a classifier that differentiates between melanoma cell lines based on {BRAF} mutation status. {A}s few as 83 genes are able to discriminate between {BRAF} mutant and {BRAF} wild-type samples with clear separation observed using hierarchical clustering. {M}ultidimensional scaling was used to visualize the relationship between a {BRAF} mutation signature and that of a generalized mitogen-activated protein kinase ({MAPK}) activation (either {BRAF} or {NRAS} mutation) in the context of the discriminating gene list. {W}e observed that samples carrying {NRAS} mutations lie somewhere between those with or without {BRAF} mutations.
             {T}hese observations suggest that there are gene-specific mutation signals in addition to a common {MAPK} activation that result from the pleiotropic effects of either {BRAF} or {NRAS} on other signaling pathways, leading to measurably different transcriptional changes.},
  doi      = {10.1038/sj.onc.1207563},
  pdf      = {../local/Pavey2004Microarray.pdf},
  file     = {Pavey2004Microarray.pdf:local/Pavey2004Microarray.pdf:PDF},
  keywords = {biosvm microarray},
  owner    = {jeanphilippevert},
  url      = {http://dx.doi.org/10.1038/sj.onc.1207563}
}
@comment{Pavlidis et al. (2001), Pacific Symposium on Biocomputing: promoter region-based classification of genes.}
@inproceedings{Pavlidis2001Promoter,
  author    = {P. Pavlidis and T. S. Furey and M. Liberto and D. Haussler and W. N. Grundy},
  title     = {Promoter {R}egion-{B}ased {C}lassification of {G}enes},
  booktitle = {Pacific {S}ymposium on {B}iocomputing},
  year      = {2001},
  pages     = {139--150},
  pdf       = {../local/pavl01b.pdf},
  file      = {pavl01b.pdf:local/pavl01b.pdf:PDF},
  keywords  = {biosvm},
  subject   = {biokernel},
  url       = {http://www.smi.stanford.edu/projects/helix/psb01/pavlidis.pdf}
}
@comment{Pavlidis et al. (2001), conference paper: gene functional classification from heterogeneous data.}
@inproceedings{Pavlidis2001Gene,
  author    = {Pavlidis, P. and Weston, J. and Cai, J. and Grundy, W. N.},
  title     = {Gene functional classification from heterogeneous data},
  booktitle = {Proceedings of the {F}ifth {A}nnual {I}nternational {C}onference on {C}omputational {B}iology},
  year      = {2001},
  pages     = {249--255},
  pdf       = {../local/pavl01.pdf},
  file      = {pavl01.pdf:local/pavl01.pdf:PDF},
  keywords  = {biosvm},
  subject   = {biokernel},
  url       = {http://www.cs.columbia.edu/compbio/papers/exp-phylo.pdf}
}
@comment{Pavlidis et al. (2002), J. Comput. Biol.: SVM gene functional classification from microarray expression data and phylogenetic profiles, with a heterogeneous kernel.}
@article{Pavlidis2002Learning,
  author   = {Pavlidis, P. and Weston, J. and Cai, J. and Noble, W. S.},
  title    = {Learning Gene Functional Classifications from Multiple Data Types},
  journal  = {J. Comput. Biol.},
  year     = {2002},
  volume   = {9},
  pages    = {401--411},
  number   = {2},
  abstract = {In our attempts to understand cellular function at the molecular level, we must be able to synthesize information from disparate types of genomic data. {W}e consider the problem of inferring gene functional classifications from a heterogeneous data set consisting of {DNA} microarray expression measurements and phylogenetic profiles from whole-genome sequence comparisons. {W}e demonstrate the application of the support vector machine ({SVM}) learning algorithm to this functional inference task. {O}ur results suggest the importance of exploiting prior information about the heterogeneity of the data. {I}n particular, we propose an {SVM} kernel function that is explicitly heterogeneous. {I}n addition, we describe feature scaling methods for further exploiting prior knowledge of heterogeneity by giving each data type different weights.},
  doi      = {10.1089/10665270252935539},
  pdf      = {../local/Pavlidis2002Learning.pdf},
  file     = {Pavlidis2002Learning.pdf:local/Pavlidis2002Learning.pdf:PDF},
  keywords = {biosvm},
  owner    = {vert}
}
@comment{Peng et al. (2003), FEBS Lett.: multiclass cancer classification from microarray data combining genetic algorithms and SVMs. Percent signs in the abstract are escaped so the text survives LaTeX rendering.}
@article{Peng2003Molecular,
  author   = {Peng, S. and Xu, Q. and Ling, X. B. and Peng, X. and Du, W. and Chen, L.},
  title    = {Molecular classification of cancer types from microarray data using the combination of genetic algorithms and support vector machines.},
  journal  = {F{EBS} {L}ett.},
  year     = {2003},
  volume   = {555},
  pages    = {358--362},
  number   = {2},
  abstract = {Simultaneous multiclass classification of tumor types is essential for future clinical implementations of microarray-based cancer diagnosis. {I}n this study, we have combined genetic algorithms ({GA}s) and all paired support vector machines ({SVM}s) for multiclass cancer identification. {T}he predictive features have been selected through iterative {SVM}s/{GA}s, and recursive feature elimination post-processing steps, leading to a very compact cancer-related predictive gene set. {L}eave-one-out cross-validations yielded accuracies of 87.93\% for the eight-class and 85.19\% for the fourteen-class cancer classifications, outperforming the results derived from previously published methods.},
  doi      = {10.1016/S0014-5793(03)01275-4},
  pdf      = {../local/Peng2003Molecular.pdf},
  file     = {Peng2003Molecular.pdf:local/Peng2003Molecular.pdf:PDF},
  keywords = {biosvm microarray},
  owner    = {jeanphilippevert},
  url      = {http://dx.doi.org/10.1016/S0014-5793(03)01275-4}
}
@comment{Pham, Satou and Ho (2005), J. Bioinform. Comput. Biol.: SVM prediction and analysis of beta-turns and gamma-turns in proteins.}
@article{Pham2005Support,
  author   = {Tho Hoan Pham and Kenji Satou and Tu Bao Ho},
  title    = {Support vector machines for prediction and analysis of beta and gamma-turns in proteins.},
  journal  = {J. {B}ioinform. {C}omput. {B}iol.},
  year     = {2005},
  volume   = {3},
  pages    = {343--358},
  number   = {2},
  month    = apr,
  abstract = {Tight turns have long been recognized as one of the three important features of proteins, together with alpha-helix and beta-sheet. {T}ight turns play an important role in globular proteins from both the structural and functional points of view. {M}ore than 90\% tight turns are beta-turns and most of the rest are gamma-turns. {A}nalysis and prediction of beta-turns and gamma-turns is very useful for design of new molecules such as drugs, pesticides, and antigens. {I}n this paper we investigated two aspects of applying support vector machine ({SVM}), a promising machine learning method for bioinformatics, to prediction and analysis of beta-turns and gamma-turns. {F}irst, we developed two {SVM}-based methods, called {BTSVM} and {GTSVM}, which predict beta-turns and gamma-turns in a protein from its sequence. {W}hen compared with other methods, {BTSVM} has a superior performance and {GTSVM} is competitive. {S}econd, we used {SVM}s with a linear kernel to estimate the support of amino acids for the formation of beta-turns and gamma-turns depending on their position in a protein. {O}ur analysis results are more comprehensive and easier to use than the previous results in designing turns in proteins.},
  keywords = {biosvm},
  pii      = {S0219720005001089}
}
@comment{Pham, Satou and Ho (2003), Genome Inform Ser Workshop Genome Inform: SVM prediction and analysis of beta-turns in proteins.}
@article{Pham2003Prediction,
  author   = {Tho Hoan Pham and Kenji Satou and Tu Bao Ho},
  title    = {Prediction and analysis of beta-turns in proteins by support vector machine.},
  journal  = {Genome {I}nform {S}er {W}orkshop {G}enome {I}nform},
  year     = {2003},
  volume   = {14},
  pages    = {196--205},
  abstract = {Tight turn has long been recognized as one of the three important features of proteins after the alpha-helix and beta-sheet. {T}ight turns play an important role in globular proteins from both the structural and functional points of view. {M}ore than 90\% tight turns are beta-turns. {A}nalysis and prediction of beta-turns in particular and tight turns in general are very useful for the design of new molecules such as drugs, pesticides, and antigens. {I}n this paper, we introduce a support vector machine ({SVM}) approach to prediction and analysis of beta-turns. {W}e have investigated two aspects of applying {SVM} to the prediction and analysis of beta-turns. {F}irst, we developed a new {SVM} method, called {BTSVM}, which predicts beta-turns of a protein from its sequence. {T}he prediction results on the dataset of 426 non-homologous protein chains by sevenfold cross-validation technique showed that our method is superior to the other previous methods. {S}econd, we analyzed how amino acid positions support (or prevent) the formation of beta-turns based on the "multivariable" classification model of a linear {SVM}. {T}his model is more general than the other ones of previous statistical methods. {O}ur analysis results are more comprehensive and easier to use than previously published analysis results.},
  keywords = {biosvm}
}
@comment{Plewczynski et al. (2005), Cell Mol Biol Lett: SVM identification of phosphorylation sites from sequence. The citation key spelling is kept as-is so existing citations keep resolving.}
@article{Plewczyski2005support,
  author   = {Dariusz Plewczynski and Adrian Tkacz and Adam Godzik and Leszek Rychlewski},
  title    = {A support vector machine approach to the identification of phosphorylation sites.},
  journal  = {Cell {M}ol {B}iol {L}ett},
  year     = {2005},
  volume   = {10},
  pages    = {73--89},
  number   = {1},
  abstract = {We describe a bioinformatics tool that can be used to predict the position of phosphorylation sites in proteins based only on sequence information. {T}he method uses the support vector machine ({SVM}) statistical learning theory. {T}he statistical models for phosphorylation by various types of kinases are built using a dataset of short (9-amino acid long) sequence fragments. {T}he sequence segments are dissected around post-translationally modified sites of proteins that are on the current release of the {S}wiss-{P}rot database, and that were experimentally confirmed to be phosphorylated by any kinase. {W}e represent them as vectors in a multidimensional abstract space of short sequence fragments. {T}he prediction method is as follows. {F}irst, a given query protein sequence is dissected into overlapping short segments. {A}ll the fragments are then projected into the multidimensional space of sequence fragments via a collection of different representations. {T}hose points are classified with pre-built statistical models (the {SVM} method with linear, polynomial and radial kernel functions) either as phosphorylated or inactive ones. {T}he resulting list of plausible sites for phosphorylation by various types of kinases in the query protein is returned to the user. {T}he efficiency of the method for each type of phosphorylation is estimated using leave-one-out tests and presented here. {T}he sensitivities of the models can reach over 70\%, depending on the type of kinase. {T}he additional information from profile representations of short sequence fragments helps in gaining a higher degree of accuracy in some phosphorylation types.
             {T}he further development of an automatic phosphorylation site annotation predictor based on our algorithm should yield a significant improvement when using statistical algorithms in order to quantify the results.},
  pdf      = {../local/Plewczyski2005support.pdf},
  file     = {Plewczyski2005support.pdf:local/Plewczyski2005support.pdf:PDF},
  keywords = {biosvm}
}
@comment{Plewczynski et al. (2005), Bioinformatics: AutoMotif server for SVM prediction of single-residue post-translational modification sites.}
@article{Plewczynski2005AutoMotif,
  author   = {Dariusz Plewczynski and Adrian Tkacz and Lucjan Stanislaw Wyrwicz and Leszek Rychlewski},
  title    = {Auto{M}otif server: prediction of single residue post-translational modifications in proteins.},
  journal  = {Bioinformatics},
  year     = {2005},
  volume   = {21},
  pages    = {2525--2527},
  number   = {10},
  month    = may,
  abstract = {The {A}uto{M}otif {S}erver allows for identification of post-translational modification ({PTM}) sites in proteins based only on local sequence information. {T}he local sequence preferences of short segments around {PTM} residues are described here as linear functional motifs ({LFM}s). {S}equence models for all types of {PTM}s are trained by support vector machine on short-sequence fragments of proteins in the current release of {S}wiss-{P}rot database (phosphorylation by various protein kinases, sulfation, acetylation, methylation, amidation, etc.). {T}he accuracy of the identification is estimated using the standard leave-one-out procedure. {T}he sensitivities for all types of short {LFM}s are in the range of 70\%. {AVAILABILITY}: {T}he {A}uto{M}otif {S}erver is available free for academic use at http://automotif.bioinfo.pl/},
  doi      = {10.1093/bioinformatics/bti333},
  pdf      = {../local/Plewczynski2005AutoMotif.pdf},
  file     = {Plewczynski2005AutoMotif.pdf:local/Plewczynski2005AutoMotif.pdf:PDF},
  keywords = {biosvm},
  pii      = {bti333},
  url      = {http://dx.doi.org/10.1093/bioinformatics/bti333}
}
@comment{Pochet et al. (2004), Bioinformatics: benchmark of linear versus non-linear (RBF kernel) classification and dimensionality reduction on microarray data.}
@article{Pochet2004Systematic,
  author   = {Pochet, N. and De Smet, F. and Suykens, J. A. K. and De Moor, B. L. R.},
  title    = {Systematic benchmarking of microarray data classification: assessing the role of non-linearity and dimensionality reduction},
  journal  = {Bioinformatics},
  year     = {2004},
  volume   = {20},
  pages    = {3185--3195},
  number   = {17},
  month    = nov,
  abstract = {Motivation: {M}icroarrays are capable of determining the expression levels of thousands of genes simultaneously. {I}n combination with classification methods, this technology can be useful to support clinical management decisions for individual patients, e.g. in oncology. {T}he aim of this paper is to systematically benchmark the role of non-linear versus linear techniques and dimensionality reduction methods. {R}esults: {A} systematic benchmarking study is performed by comparing linear versions of standard classification and dimensionality reduction techniques with their non-linear versions based on non-linear kernel functions with a radial basis function ({RBF}) kernel. {A} total of 9 binary cancer classification problems, derived from 7 publicly available microarray datasets, and 20 randomizations of each problem are examined. {C}onclusions: {T}hree main conclusions can be formulated based on the performances on independent test sets. (1) {W}hen performing classification with least squares support vector machines ({LS}-{SVM}s) (without dimensionality reduction), {RBF} kernels can be used without risking too much overfitting. {T}he results obtained with well-tuned {RBF} kernels are never worse and sometimes even statistically significantly better compared to results obtained with a linear kernel in terms of test set receiver operating characteristic and test set accuracy performances. (2) {E}ven for classification with linear classifiers like {LS}-{SVM} with linear kernel, using regularization is very important.
             (3) {W}hen performing kernel principal component analysis (kernel {PCA}) before classification, using an {RBF} kernel for kernel {PCA} tends to result in overfitting, especially when using supervised feature selection. {I}t has been observed that an optimal selection of a large number of features is often an indication for overfitting. {K}ernel {PCA} with linear kernel gives better results. {A}vailability: {M}atlab scripts are available on request. {S}upplementary information: http://www.esat.kuleuven.ac.be/~npochet/{B}ioinformatics/},
  doi      = {10.1093/bioinformatics/bth383},
  pdf      = {../local/Pochet2004Systematic.pdf},
  file     = {Pochet2004Systematic.pdf:local/Pochet2004Systematic.pdf:PDF},
  keywords = {biosvm microarray},
  owner    = {jeanphilippevert},
  url      = {http://dx.doi.org/10.1093/bioinformatics/bth383}
}
@comment{Prados et al. (2004), Proteomics: mining mass spectra with machine learning, notably SVMs, for stroke diagnosis and biomarker discovery.}
@article{Prados2004Mining,
  author   = {Prados, J. and Kalousis, A. and Sanchez, J. C. and Allard, L. and Carrette, O. and Hilario, M.},
  title    = {Mining mass spectra for diagnosis and biomarker discovery of cerebral accidents.},
  journal  = {Proteomics},
  year     = {2004},
  volume   = {4},
  pages    = {2320--2332},
  number   = {8},
  abstract = {In this paper we try to identify potential biomarkers for early stroke diagnosis using surface-enhanced laser desorption/ionization mass spectrometry coupled with analysis tools from machine learning and data mining. {D}ata consist of 42 specimen samples, i.e., mass spectra divided in two big categories, stroke and control specimens. {A}mong the stroke specimens two further categories exist that correspond to ischemic and hemorrhagic stroke; in this paper we limit our data analysis to discriminating between control and stroke specimens. {W}e performed two suites of experiments. {I}n the first one we simply applied a number of different machine learning algorithms; in the second one we have chosen the best performing algorithm as it was determined from the first phase and coupled it with a number of different feature selection methods. {T}he reason for this was 2-fold, first to establish whether feature selection can indeed improve performance, which in our case it did not seem to confirm, but more importantly to acquire a small list of potentially interesting biomarkers. {O}f the different methods explored the most promising one was support vector machines which gave us high levels of sensitivity and specificity.
             {F}inally, by analyzing the models constructed by support vector machines we produced a small set of 13 features that could be used as potential biomarkers, and which exhibited good performance both in terms of sensitivity, specificity and model stability.},
  doi      = {10.1002/pmic.200400857},
  pdf      = {../local/Prados2004Mining.pdf},
  file     = {Prados2004Mining.pdf:local/Prados2004Mining.pdf:PDF},
  keywords = {biosvm proteomics},
  owner    = {jeanphilippevert},
  url      = {http://dx.doi.org/10.1002/pmic.200400857}
}
@comment{Qian et al. (2003), Bioinformatics: SVM prediction of transcription factor targets from gene expression data. The percent sign in the abstract is escaped so the text survives LaTeX rendering.}
@article{Qian2003Prediction,
  author   = {Qian, J. and Lin, J. and Luscombe, N. M. and Yu, H. and Gerstein, M.},
  title    = {Prediction of regulatory networks: genome-wide identification of transcription factor targets from gene expression data},
  journal  = {Bioinformatics},
  year     = {2003},
  volume   = {19},
  pages    = {1917--1926},
  number   = {15},
  abstract = {Motivation: {D}efining regulatory networks, linking transcription factors ({TF}s) to their targets, is a central problem in post-genomic biology. {O}ne might imagine one could readily determine these networks through inspection of gene expression data. {H}owever, the relationship between the expression timecourse of a transcription factor and its target is not obvious (e.g. simple correlation over the timecourse), and current analysis methods, such as hierarchical clustering, have not been very successful in deciphering them. {R}esults: {H}ere we introduce an approach based on support vector machines ({SVM}s) to predict the targets of a transcription factor by identifying subtle relationships between their expression profiles. {I}n particular, we used {SVM}s to predict the regulatory targets for 36 transcription factors in the {S}accharomyces cerevisiae genome based on the microarray expression data from many different physiological conditions. {W}e trained and tested our {SVM} on a data set constructed to include a significant number of both positive and negative examples, directly addressing data imbalance issues. {T}his was non-trivial given that most of the known experimental information is only for positives. {O}verall, we found that 63\% of our {TF}-target relationships were confirmed through cross-validation. {W}e further assessed the performance of our regulatory network identifications by comparing them with the results from two recent genome-wide {C}h{IP}-chip experiments.
             {O}verall, we find the agreement between our results and these experiments is comparable to the agreement (albeit low) between the two experiments. {W}e find that this network has a delocalized structure with respect to chromosomal positioning, with a given transcription factor having targets spread fairly uniformly across the genome. {A}vailability: {T}he overall network of the relationships is available on the web at http://bioinfo.mbb.yale.edu/expression/echipchip},
  pdf      = {../local/Qian2003Prediction.pdf},
  file     = {Qian2003Prediction.pdf:local/Qian2003Prediction.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/15/1917}
}
@comment{Qin, Lewis and Noble (2003), Bioinformatics: kernel hierarchical clustering of genes from microarray expression data.}
@article{Qin2003Kernel,
  author   = {Qin, J. and Lewis, D. P. and Noble, W. S.},
  title    = {Kernel hierarchical gene clustering from microarray expression data},
  journal  = {Bioinformatics},
  year     = {2003},
  volume   = {19},
  pages    = {2097--2104},
  number   = {16},
  abstract = {Motivation: {U}nsupervised analysis of microarray gene expression data attempts to find biologically significant patterns within a given collection of expression measurements. {F}or example, hierarchical clustering can be applied to expression profiles of genes across multiple experiments, identifying groups of genes that share similiar expression profiles. {P}revious work using the support vector machine supervised learning algorithm with microarray data suggests that higher-order features, such as pairwise and tertiary correlations across multiple experiments, may provide significant benefit in learning to recognize classes of co-expressed genes. {R}esults: {W}e describe a generalization of the hierarchical clustering algorithm that efficiently incorporates these higher-order features by using a kernel function to map the data into a high-dimensional feature space. {W}e then evaluate the utility of the kernel hierarchical clustering algorithm using both internal and external validation. {T}he experiments demonstrate that the kernel representation itself is insufficient to provide improved clustering performance. {W}e conclude that mapping gene expression data into a high-dimensional feature space is only a good idea when combined with a learning algorithm, such as the support vector machine that does not suffer from the curse of dimensionality. {A}vailability: {S}upplementary data at www.cs.columbia.edu/compbio/hiclust. {S}oftware source code available by request.},
  pdf      = {../local/Qin2003Kernel.pdf},
  file     = {Qin2003Kernel.pdf:local/Qin2003Kernel.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/16/2097}
}
@article{Raghava2005Correlation,
  author   = {Gajendra P S Raghava and Joon H Han},
  title    = {Correlation and prediction of gene expression level from amino acid and dipeptide composition of its protein.},
  journal  = {B{MC} {B}ioinformatics},
  year     = {2005},
  volume   = {6},
  pages    = {59},
  number   = {1},
  month    = {Mar},
  abstract = {B{ACKGROUND}: {A} large number of papers have been published on analysis of microarray data with particular emphasis on normalization of data, detection of differentially expressed genes, clustering of genes and regulatory network. {O}n other hand there are only few studies on relation between expression level and composition of nucleotide/protein sequence, using expression data. {T}here is a need to understand why particular genes/proteins express more in particular conditions. {I}n this study, we analyze 3468 genes of {S}accharomyces cerevisiae obtained from {H}olstege et al., (1998) to understand the relationship between expression level and amino acid composition. {RESULTS}: {W}e compute the correlation between expression of a gene and amino acid composition of its protein. {I}t was observed that some residues (like {A}la, {G}ly, {A}rg and {V}al) have significant positive correlation (r > 0.20) and some other residues ({L}ike {A}sp, {L}eu, {A}sn and {S}er) have negative correlation (r < -0.15) with the expression of genes. {A} significant negative correlation (r = -0.18) was also found between length and gene expression. {T}hese observations indicate the relationship between percent composition and gene expression level. {T}hus, attempts have been made to develop a {S}upport {V}ector {M}achine ({SVM}) based method for predicting the expression level of genes from its protein sequence. {I}n this method the {SVM} is trained with proteins whose gene expression data is known in a given condition. {T}hen trained {SVM} is used to predict the gene expression of other proteins of the same organism in the same condition. {A} correlation coefficient r = 0.70 was obtained between predicted and experimentally determined expression of genes, which improves from r = 0.70 to 0.72 when dipeptide composition was used instead of residue composition. {T}he method was evaluated using 5-fold cross validation test. {W}e also demonstrate that amino acid composition information along with gene expression data can be used for improving the function classification of proteins. {CONCLUSION}: {T}here is a correlation between gene expression and amino acid composition that can be used to predict the expression level of genes up to a certain extent. {A} web server based on the above strategy has been developed for calculating the correlation between amino acid composition and gene expression and prediction of expression level http://kiwi.postech.ac.kr/raghava/lgepred/. {T}his server will allow users to study the evolution from expression data.},
  doi      = {10.1186/1471-2105-6-59},
  keywords = {biosvm},
  pii      = {1471-2105-6-59},
  url      = {http://dx.doi.org/10.1186/1471-2105-6-59}
}
@article{Ramaswamy2001Multiclass,
  author   = {Ramaswamy, S. and Tamayo, P. and Rifkin, R. and Mukherjee, S. and Yeang, C. H. and Angelo, M. and Ladd, C. and Reich, M. and Latulippe, E. and Mesirov, J. P. and Poggio, T. and Gerald, W. and Loda, M. and Lander, E. S. and Golub, T. R.},
  title    = {Multiclass cancer diagnosis using tumor gene expression signatures},
  journal  = {Proc. {N}atl. {A}cad. {S}ci. {USA}},
  year     = {2001},
  volume   = {98},
  pages    = {15149-15154},
  number   = {26},
  month    = {Dec},
  abstract = {The optimal treatment of patients with cancer depends on establishing accurate diagnoses by using a complex combination of clinical and histopathological data. {I}n some instances, this task is difficult or impossible because of atypical clinical presentation or histopathology. {T}o determine whether the diagnosis of multiple common adult malignancies could be achieved purely by molecular classification, we subjected 218 tumor samples, spanning 14 common tumor types, and 90 normal tissue samples to oligonucleotide microarray gene expression analysis. {T}he expression levels of 16,063 genes and expressed sequence tags were used to evaluate the accuracy of a multiclass classifier based on a support vector machine algorithm. {O}verall classification accuracy was 78\%, far exceeding the accuracy of random classification (9\%). {P}oorly differentiated cancers resulted in low-confidence predictions and could not be accurately classified according to their tissue of origin, indicating that they are molecularly distinct entities with dramatically different gene expression patterns compared with their well differentiated counterparts. {T}aken together, these results demonstrate the feasibility of accurate, multiclass molecular cancer classification and suggest a strategy for future clinical implementation of molecular cancer diagnostics.},
  doi      = {10.1073/pnas.211566398},
  pdf      = {../local/Ramaswamy2001Multiclass.pdf},
  file     = {Ramaswamy2001Multiclass.pdf:local/Ramaswamy2001Multiclass.pdf:PDF},
  keywords = {biosvm microarray},
  owner    = {vert},
  url      = {http://dx.doi.org/10.1073/pnas.211566398}
}
@article{Rangwala2005Profile-based,
  author    = {Rangwala, H. and Karypis, G.},
  title     = {Profile-based direct kernels for remote homology detection and fold recognition.},
  journal   = {Bioinformatics},
  year      = {2005},
  volume    = {21},
  pages     = {4239--4247},
  number    = {23},
  month     = {Dec},
  abstract  = {MOTIVATION: Protein remote homology detection is a central problem in computational biology. Supervised learning algorithms based on support vector machines are currently one of the most effective methods for remote homology detection. The performance of these methods depends on how the protein sequences are modeled and on the method used to compute the kernel function between them. RESULTS: We introduce two classes of kernel functions that are constructed by combining sequence profiles with new and existing approaches for determining the similarity between pairs of protein sequences. These kernels are constructed directly from these explicit protein similarity measures and employ effective profile-to-profile scoring schemes for measuring the similarity between pairs of proteins. Experiments with remote homology detection and fold recognition problems show that these kernels are capable of producing results that are substantially better than those produced by all of the existing state-of-the-art SVM-based methods. In addition, the experiments show that these kernels, even when used in the absence of profiles, produce results that are better than those produced by existing non-profile-based schemes. AVAILABILITY: The programs for computing the various kernel functions are available on request from the authors.},
  doi       = {10.1093/bioinformatics/bti687},
  keywords  = {biosvm},
  owner     = {vert},
  pii       = {bti687},
  pmid      = {16188929},
  timestamp = {2007.08.01},
  url       = {http://dx.doi.org/10.1093/bioinformatics/bti687}
}
@article{Rensing2005Protein,
  author   = {Stefan A Rensing and Dana Fritzowsky and Daniel Lang and Ralf Reski},
  title    = {Protein encoding genes in an ancient plant: analysis of codon usage, retained genes and splice sites in a moss, {P}hyscomitrella patens.},
  journal  = {B{MC} {G}enomics},
  year     = {2005},
  volume   = {6},
  pages    = {43},
  number   = {1},
  month    = {Mar},
  abstract = {B{ACKGROUND}: {T}he moss {P}hyscomitrella patens is an emerging plant model system due to its high rate of homologous recombination, haploidy, simple body plan, physiological properties as well as phylogenetic position. {A}vailable {EST} data was clustered and assembled, and provided the basis for a genome-wide analysis of protein encoding genes. {RESULTS}: {W}e have clustered and assembled {P}hyscomitrella patens {EST} and {CDS} data in order to represent the transcriptome of this non-seed plant. {C}lustering of the publicly available data and subsequent prediction resulted in a total of 19,081 non-redundant {ORF}. {O}f these putative transcripts, approximately 30\% have a homolog in both rice and {A}rabidopsis transcriptome. {M}ore than 130 transcripts are not present in seed plants but can be found in other kingdoms. {T}hese potential "retained genes" might have been lost during seed plant evolution. {F}unctional annotation of these genes reveals unequal distribution among taxonomic groups and intriguing putative functions such as cytotoxicity and nucleic acid repair. {W}hereas introns in the moss are larger on average than in the seed plant {A}rabidopsis thaliana, position and amount of introns are approximately the same. {C}ontrary to {A}rabidopsis, where {CDS} contain on average 44\% {G}/{C}, in {P}hyscomitrella the average {G}/{C} content is 50\%. {I}nterestingly, moss orthologs of {A}rabidopsis genes show a significant drift of codon fraction usage, towards the seed plant. {W}hile averaged codon bias is the same in {P}hyscomitrella and {A}rabidopsis, the distribution pattern is different, with 15\% of moss genes being unbiased. {S}pecies-specific, sensitive and selective splice site prediction for {P}hyscomitrella has been developed using a dataset of 368 donor and acceptor sites, utilizing a support vector machine. {T}he prediction accuracy is better than those achieved with tools trained on {A}rabidopsis data. {CONCLUSION}: {A}nalysis of the moss transcriptome displays differences in gene structure, codon and splice site usage in comparison with the seed plant {A}rabidopsis. {P}utative retained genes exhibit possible functions that might explain the peculiar physiological properties of mosses. {B}oth the transcriptome representation (including a {BLAST} and retrieval service) and splice site prediction have been made available on http://www.cosmoss.org, setting the basis for assembly and annotation of the {P}hyscomitrella genome, of which draft shotgun sequences will become available in 2005.},
  doi      = {10.1186/1471-2164-6-43},
  pdf      = {../local/Rensing2005Protein.pdf},
  file     = {Rensing2005Protein.pdf:local/Rensing2005Protein.pdf:PDF},
  keywords = {biosvm},
  pii      = {1471-2164-6-43},
  url      = {http://dx.doi.org/10.1186/1471-2164-6-43}
}
@article{Res2005evolution,
  author   = {I. Res and I. Mihalek and O. Lichtarge},
  title    = {An evolution based classifier for prediction of protein interfaces without using protein structures.},
  journal  = {Bioinformatics},
  year     = {2005},
  volume   = {21},
  pages    = {2496--2501},
  number   = {10},
  month    = {May},
  abstract = {M{OTIVATION}: {T}he number of available protein structures still lags far behind the number of known protein sequences. {T}his makes it important to predict which residues participate in protein-protein interactions using only sequence information. {F}ew studies have tackled this problem until now. {RESULTS}: {W}e applied support vector machines to sequences in order to generate a classification of all protein residues into those that are part of a protein interface and those that are not. {F}or the first time evolutionary information was used as one of the attributes and this inclusion of evolutionary importance rankings improves the classification. {L}eave-one-out cross-validation experiments show that prediction accuracy reaches 64\%.},
  doi      = {10.1093/bioinformatics/bti340},
  pdf      = {../local/Res2005evolution.pdf},
  file     = {Res2005evolution.pdf:local/Res2005evolution.pdf:PDF},
  keywords = {biosvm},
  pii      = {bti340},
  url      = {http://dx.doi.org/10.1093/bioinformatics/bti340}
}
@article{Rice2005Mining,
  author   = {Simon B Rice and Goran Nenadic and Benjamin J Stapley},
  title    = {Mining protein function from text using term-based support vector machines.},
  journal  = {B{MC} {B}ioinformatics},
  year     = {2005},
  volume   = {6},
  number   = {Suppl. 1},
  pages    = {S22},
  abstract = {B{ACKGROUND}: {T}ext mining has spurred huge interest in the domain of biology. {T}he goal of the {B}io{C}re{A}t{I}v{E} exercise was to evaluate the performance of current text mining systems. {W}e participated in {T}ask 2, which addressed assigning {G}ene {O}ntology terms to human proteins and selecting relevant evidence from full-text documents. {W}e approached it as a modified form of the document classification task. {W}e used a supervised machine-learning approach (based on support vector machines) to assign protein function and select passages that support the assignments. {A}s classification features, we used a protein's co-occurring terms that were automatically extracted from documents. {RESULTS}: {T}he results evaluated by curators were modest, and quite variable for different problems: in many cases we have relatively good assignment of {GO} terms to proteins, but the selected supporting text was typically non-relevant (precision spanning from 3\% to 50\%). {T}he method appears to work best when a substantial set of relevant documents is obtained, while it works poorly on single documents and/or short passages. {T}he initial results suggest that our approach can also mine annotations from text even when an explicit statement relating a protein to a {GO} term is absent. {CONCLUSION}: {A} machine learning approach to mining protein function predictions from text can yield good performance only if sufficient training data is available, and significant amount of supporting data is used for prediction. {T}he most promising results are for combined document retrieval and {GO} term assignment, which calls for the integration of methods developed in {B}io{C}re{A}t{I}v{E} {T}ask 1 and {T}ask 2.},
  doi      = {10.1186/1471-2105-6-S1-S22},
  pdf      = {../local/Rice2005Mining.pdf},
  file     = {Rice2005Mining.pdf:local/Rice2005Mining.pdf:PDF},
  keywords = {biosvm},
  pii      = {1471-2105-6-S1-S22},
  url      = {http://dx.doi.org/10.1186/1471-2105-6-S1-S22}
}
@article{Riedesel2004Peptide,
  author   = {Henning Riedesel and Bj{\"o}rn Kolbeck and Oliver Schmetzer and Ernst-Walter Knapp},
  title    = {Peptide binding at class {I} major histocompatibility complex scored with linear functions and support vector machines.},
  journal  = {Genome {I}nform {S}er {W}orkshop {G}enome {I}nform},
  year     = {2004},
  volume   = {15},
  pages    = {198-212},
  number   = {1},
  abstract = {We explore two different methods to predict the binding ability of nonapeptides at the class {I} major histocompatibility complex using a general linear scoring function that defines a separating hyperplane in the feature space of sequences. {I}n absence of suitable data on non-binding nonapeptides we generated sequences randomly from a selected set of proteins from the protein data bank. {T}he parameters of the scoring function were determined by a generalized least square optimization ({LSM}) and alternatively by the support vector machine ({SVM}). {W}ith the generalized {LSM} impaired data for learning with a small set of binding peptides and a large set of non-binding peptides can be treated in a balanced way rendering {LSM} more successful than {SVM}, while for symmetric data sets {SVM} has a slight advantage compared to {LSM}.},
  pdf      = {../local/Riedesel2004Peptide.pdf},
  file     = {Riedesel2004Peptide.pdf:local/Riedesel2004Peptide.pdf:PDF},
  keywords = {biosvm},
  url      = {http://www.jsbi.org/journal/IBSB04/IBSB04F004.html}
}
@article{Rose2005Correlation,
  author   = {Rose, J. R. and Turkett, Jr., W. H. and Oroian, I. C. and Laegreid, W. W. and Keele, J.},
  title    = {Correlation of amino acid preference and mammalian viral genome type},
  journal  = {Bioinformatics},
  year     = {2005},
  abstract = {Motivation: {I}n the event of an outbreak of a disease caused by an initially unknown pathogen, the ability to characterize anonymous sequences prior to isolation and culturing of the pathogen will be helpful. {W}e show that it is possible to classify viral sequences by genome type (ds{DNA}, ss{DNA}, ss{RNA} positive strand, ss{RNA} negative strand, retroid) using amino acid distribution. {R}esults: {I}n this paper we describe the results of analysis of amino acid preference in mammalian viruses. {T}he study was carried out at the genome level as well as two shorter sequence levels: short (300 amino acids) and medium length (660 amino acids). {T}he analysis indicates a correlation between the viral genome types ds{DNA}, ss{DNA}, ss{RNA} positive strand, ss{RNA} negative strand, and retroid and amino acid preference. {W}e investigated three different models of amino acid preference. {T}he simplest amino acid preference model, 1-{AAP}, is a normalized description of the frequency of amino acids in genomes of a viral genome type. {A} slightly more complex model is the ordered pair amino acid preference model (2-{AAP}), which characterizes genomes of different viral genome types by the frequency of ordered pairs of amino acids. {T}he most complex and accurate model is the ordered triple amino acid preference model (3-{AAP}), which is based on ordered triples of amino acids. {T}he results demonstrate that mammalian viral genome types differ in their amino acid preference. {A}vailability: {T}he tools used to format and analyze data and supplementary material are available at http://www.cse.sc.edu/~rose/amino{P}reference/index.html.},
  doi      = {10.1093/bioinformatics/bti174},
  pdf      = {../local/Rose2005Correlation.pdf},
  file     = {Rose2005Correlation.pdf:local/Rose2005Correlation.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/bti174v1}
}
@article{Rudd2005Eclair,
  author   = {Rudd, S. and Tetko, I. V.},
  title    = {Eclair--a web service for unravelling species origin of sequences sampled from mixed host interfaces.},
  journal  = {Nucleic {A}cids {R}es},
  year     = {2005},
  volume   = {33},
  pages    = {W724--W727},
  number   = {Web Server issue},
  month    = {Jul},
  abstract = {The identification of the genes that participate at the biological interface of two species remains critical to our understanding of the mechanisms of disease resistance, disease susceptibility and symbiosis. {T}he sequencing of complementary {DNA} (c{DNA}) libraries prepared from the biological interface between two organisms provides an inexpensive way to identify the novel genes that may be expressed as a cause or consequence of compatible or incompatible interactions. {S}equence classification and annotation of species origin typically use an orthology-based approach and require access to large portions of either genome, or a close relative. {N}ovel species- or clade-specific sequences may have no counterpart within existing databases and remain ambiguous features. {H}ere we present a web-service, {E}clair, which utilizes support vector machines for the classification of the origin of expressed sequence tags stemming from mixed host c{DNA} libraries. {I}n addition to providing an interface for the classification of sequences, users are presented with the opportunity to train a model to suit their preferred species pair. {E}clair is freely available at http://eclair.btk.fi.},
  doi      = {10.1093/nar/gki434},
  pdf      = {../local/Rudd2005Eclair.pdf},
  file     = {Rudd2005Eclair.pdf:local/Rudd2005Eclair.pdf:PDF},
  keywords = {biosvm},
  pii      = {33/suppl_2/W724},
  url      = {http://dx.doi.org/10.1093/nar/gki434}
}
@article{Ruepp2005Assessment,
  author   = {Ruepp, S. and Boess, F. and Suter, L. and de Vera, M. C. and Steiner, G. and Steele, T. and Weiser, T. and Albertini, S.},
  title    = {Assessment of hepatotoxic liabilities by transcript profiling.},
  journal  = {Toxicol {A}ppl {P}harmacol},
  year     = {2005},
  month    = {Jun},
  abstract = {Male {W}istar rats were treated with various model compounds or the appropriate vehicle controls in order to create a reference database for toxicogenomics assessment of novel compounds. {H}epatotoxic compounds in the database were either known hepatotoxicants or showed hepatotoxicity during preclinical testing. {H}istopathology and clinical chemistry data were used to anchor the transcript profiles to an established endpoint (steatosis, cholestasis, direct acting, peroxisomal proliferation or nontoxic/control). {T}hese reference data were analyzed using a supervised learning method (support vector machines, {SVM}) to generate classification rules. {T}his predictive model was subsequently used to assess compounds with regard to a potential hepatotoxic liability. {A} steatotic and a non-hepatotoxic 5{HT}(6) receptor antagonist compound from the same series were successfully discriminated by this toxicogenomics model. {A}dditionally, an example is shown where a hepatotoxic liability was correctly recognized in the absence of pathological findings. {I}n vitro experiments and a dog study confirmed the correctness of the toxicogenomics alert. {A}nother interesting observation was that transcript profiles indicate toxicologically relevant changes at an earlier timepoint than routinely used methods. {T}ogether, these results support the useful application of toxicogenomics in raising alerts for adverse effects and generating mechanistic hypotheses that can be followed up by confirmatory experiments.},
  doi      = {10.1016/j.taap.2005.05.008},
  pdf      = {../local/Ruepp2005Assessment.pdf},
  file     = {Ruepp2005Assessment.pdf:local/Ruepp2005Assessment.pdf:PDF},
  keywords = {biosvm},
  pii      = {S0041-008X(05)00295-4},
  url      = {http://dx.doi.org/10.1016/j.taap.2005.05.008}
}
@comment{NOTE(review): the abstract of the next entry describes a general review of SVM applications in computational biology, which does not match the entry title on splice site detection. It may have been copied from another chapter of the same book. TODO confirm against the published chapter.}
@incollection{Ratsch2004Accurate, author = {R{\"a}tsch, G. and Sonnenburg, S.}, title = {Accurate splice site detection for {C}aenorhabditis elegans}, booktitle = {Kernel {M}ethods in {C}omputational {B}iology}, publisher = {MIT Press}, year = {2004}, editor = {Sch{\"o}lkopf, B. and Tsuda, K. and Vert, J.P.}, pages = {277-298}, abstract = {During the past three years, the support vector machine learning algorithm has been extensively applied within the field of computational biology. {T}he algorithm has been used to detect patterns within and among biological sequences, to classify genes and patients based upon gene expression profiles, and has recently been applied to several new biological problems. {T}his chapter reviews the state of the art with respect to {SVM} applications in computational biology.}, keywords = {biosvm}, owner = {vert} }
@article{Raetsch2005RASE,
  author   = {G. R{\"a}tsch and S. Sonnenburg and B. Sch{\"o}lkopf},
  title    = {R{ASE}: recognition of alternatively spliced exons in {C}.elegans.},
  journal  = {Bioinformatics},
  year     = {2005},
  volume   = {21},
  pages    = {i369-i377},
  number   = {Suppl. 1},
  month    = {Jun},
  abstract = {M{OTIVATION}: {E}ukaryotic pre-m{RNA}s are spliced to form mature m{RNA}. {P}re-m{RNA} alternative splicing greatly increases the complexity of gene expression. {E}stimates show that more than half of the human genes and at least one-third of the genes of less complex organisms, such as nematodes or flies, are alternatively spliced. {I}n this work, we consider one major form of alternative splicing, namely the exclusion of exons from the transcript. {I}t has been shown that alternatively spliced exons have certain properties that distinguish them from constitutively spliced exons. {A}lthough most recent computational studies on alternative splicing apply only to exons which are conserved among two species, our method only uses information that is available to the splicing machinery, i.e. the {DNA} sequence itself. {W}e employ advanced machine learning techniques in order to answer the following two questions: (1) {I}s a certain exon alternatively spliced? (2) {H}ow can we identify yet unidentified exons within known introns? {RESULTS}: {W}e designed a support vector machine ({SVM}) kernel well suited for the task of classifying sequences with motifs having positional preferences. {I}n order to solve the task (1), we combine the kernel with additional local sequence information, such as lengths of the exon and the flanking introns. {T}he resulting {SVM}-based classifier achieves a true positive rate of 48.5\% at a false positive rate of 1\%. {B}y scanning over single {EST} confirmed exons we identified 215 potential alternatively spliced exons. {F}or 10 randomly selected such exons we successfully performed biological verification experiments and confirmed three novel alternatively spliced exons. {T}o answer question (2), we additionally used {SVM}-based predictions to recognize acceptor and donor splice sites. {C}ombined with the above mentioned features we were able to identify 85.2\% of skipped exons within known introns at a false positive rate of 1\%. {AVAILABILITY}: {D}atasets, model selection results, our predictions and additional experimental results are available at http://www.fml.tuebingen.mpg.de/~raetsch/{RASE} {CONTACT}: {G}unnar.{R}aetsch@tuebingen.mpg.de {SUPPLEMENTARY} {INFORMATION}: http://www.fml.tuebingen.mpg.de/raetsch/{RASE}.},
  doi      = {10.1093/bioinformatics/bti1053},
  pdf      = {../local/Raetsch2005RASE.pdf},
  file     = {Raetsch2005RASE.pdf:local/Raetsch2005RASE.pdf:PDF},
  keywords = {biosvm},
  pii      = {21/suppl_1/i369},
  url      = {http://dx.doi.org/10.1093/bioinformatics/bti1053}
}
@article{Roegnvaldsson2004Why,
  author   = {Thorsteinn R{\"o}gnvaldsson and Liwen You},
  title    = {Why neural networks should not be used for {HIV}-1 protease cleavage site prediction.},
  journal  = {Bioinformatics},
  year     = {2004},
  volume   = {20},
  pages    = {1702--1709},
  number   = {11},
  month    = {Jul},
  abstract = {S{UMMARY}: {S}everal papers have been published where nonlinear machine learning algorithms, e.g. artificial neural networks, support vector machines and decision trees, have been used to model the specificity of the {HIV}-1 protease and extract specificity rules. {W}e show that the dataset used in these studies is linearly separable and that it is a misuse of nonlinear classifiers to apply them to this problem. {T}he best solution on this dataset is achieved using a linear classifier like the simple perceptron or the linear support vector machine, and it is straightforward to extract rules from these linear models. {W}e identify key residues in peptides that are efficiently cleaved by the {HIV}-1 protease and list the most prominent rules, relating them to experimental results for the {HIV}-1 protease. {MOTIVATION}: {U}nderstanding {HIV}-1 protease specificity is important when designing {HIV} inhibitors and several different machine learning algorithms have been applied to the problem. {H}owever, little progress has been made in understanding the specificity because nonlinear and overly complex models have been used. {RESULTS}: {W}e show that the problem is much easier than what has previously been reported and that linear classifiers like the simple perceptron or linear support vector machines are at least as good predictors as nonlinear algorithms. {W}e also show how sets of specificity rules can be generated from the resulting linear classifiers. {AVAILABILITY}: {T}he datasets used are available at http://www.hh.se/staff/bioinf/},
  doi      = {10.1093/bioinformatics/bth144},
  pdf      = {../local/Roegnvaldsson2004Why.pdf},
  file     = {Roegnvaldsson2004Why.pdf:local/Roegnvaldsson2004Why.pdf:PDF},
  keywords = {biosvm},
  pii      = {bth144},
  url      = {http://dx.doi.org/10.1093/bioinformatics/bth144}
}
@article{Saeh2005Lead,
  author   = {Saeh, J. and Lyne, P. and Takasaki, B. and Cosgrove, D.},
  title    = {Lead hopping using {SVM} and 3{D} pharmacophore fingerprints.},
  journal  = {J {C}hem {I}nf {M}odel},
  year     = {2005},
  volume   = {45},
  pages    = {1122-1133},
  number   = {4},
  month    = {Jul},
  abstract = {The combination of 3{D} pharmacophore fingerprints and the support vector machine classification algorithm has been used to generate robust models that are able to classify compounds as active or inactive in a number of {G}-protein-coupled receptor assays. {T}he models have been tested against progressively more challenging validation sets where steps are taken to ensure that compounds in the validation set are chemically and structurally distinct from the training set. {I}n the most challenging example, we simulate a lead-hopping experiment by excluding an entire class of compounds (defined by a core substructure) from the training set. {T}he left-out active compounds comprised approximately 40\% of the actives. {T}he model trained on the remaining compounds is able to recall 75\% of the actives from the "new" lead series while correctly classifying >99\% of the 5000 inactives included in the validation set.},
  doi      = {10.1021/ci049732r},
  pdf      = {../local/Saeh2005Lead.pdf},
  file     = {Saeh2005Lead.pdf:local/Saeh2005Lead.pdf:PDF},
  keywords = {biosvm chemoinformatics},
  url      = {http://dx.doi.org/10.1021/ci049732r}
}
@article{Saetrom2004Predicting,
  author   = {Saetrom, P.},
  title    = {Predicting the efficacy of short oligonucleotides in antisense and {RNA}i experiments with boosted genetic programming},
  journal  = {Bioinformatics},
  year     = {2004},
  volume   = {20},
  pages    = {3055-3063},
  number   = {17},
  abstract = {Motivation: {B}oth small interfering {RNA}s (si{RNA}s) and antisense oligonucleotides can selectively block gene expression. {A}lthough the two methods rely on different cellular mechanisms, these methods share the common property that not all oligonucleotides (oligos) are equally effective. {T}hat is, if m{RNA} target sites are picked at random, many of the antisense or si{RNA} oligos will not be effective. {A}lgorithms that can reliably predict the efficacy of candidate oligos can greatly reduce the cost of knockdown experiments, but previous attempts to predict the efficacy of antisense oligos have had limited success. {M}achine learning has not previously been used to predict si{RNA} efficacy. {R}esults: {W}e develop a genetic programming based prediction system that shows promising results on both antisense and si{RNA} efficacy prediction. {W}e train and evaluate our system on a previously published database of antisense efficacies and our own database of si{RNA} efficacies collected from the literature. {T}he best models gave an overall correlation between predicted and observed efficacy of 0.46 on both antisense and si{RNA} data. {A}s a comparison, the best correlations of support vector machine classifiers trained on the same data were 0.40 and 0.30, respectively. {A}vailability: {T}he prediction system uses proprietary hardware and is available for both commercial and strategic academic collaborations. {T}he si{RNA} database is available upon request.},
  doi      = {10.1093/bioinformatics/bth364},
  pdf      = {../local/Saetrom2004Predicting.pdf},
  file     = {Saetrom2004Predicting.pdf:local/Saetrom2004Predicting.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/20/17/3055}
}
@article{Saeys2004Feature,
  author   = {Saeys, Y. and Degroeve, S. and Aeyels, D. and Rouz{\'e}, P. and Van de Peer, Y.},
  title    = {Feature selection for splice site prediction: {A} new method using {EDA}-based feature ranking},
  journal  = {B{MC} {B}ioinformatics},
  year     = {2004},
  volume   = {5},
  pages    = {64},
  abstract = {Background {T}he identification of relevant biological features in large and complex datasets is an important step towards gaining insight in the processes underlying the data. {O}ther advantages of feature selection include the ability of the classification system to attain good or even better solutions using a restricted subset of features, and a faster classification. {T}hus, robust methods for fast feature selection are of key importance in extracting knowledge from complex biological data. {R}esults {I}n this paper we present a novel method for feature subset selection applied to splice site prediction, based on estimation of distribution algorithms, a more general framework of genetic algorithms. {F}rom the estimated distribution of the algorithm, a feature ranking is derived. {A}fterwards this ranking is used to iteratively discard features. {W}e apply this technique to the problem of splice site prediction, and show how it can be used to gain insight into the underlying biological process of splicing. {C}onclusion {W}e show that this technique proves to be more robust than the traditional use of estimation of distribution algorithms for feature selection: instead of returning a single best subset of features (as they normally do) this method provides a dynamical view of the feature selection process, like the traditional sequential wrapper methods. {H}owever, the method is faster than the traditional techniques, and scales better to datasets described by a large number of features.},
  doi      = {10.1186/1471-2105-5-64},
  pdf      = {../local/Saeys2004Feature.pdf},
  file     = {Saeys2004Feature.pdf:local/Saeys2004Feature.pdf:PDF},
  keywords = {biosvm},
  owner    = {vert}
}
@article{Saeys2003Fast,
  author   = {Saeys, Y. and Degroeve, S. and Aeyels, D. and Van de Peer, Y. and Rouz{\'e}, P.},
  title    = {Fast feature selection using a simple estimation of distribution algorithm: a case study on splice site prediction},
  journal  = {Bioinformatics},
  year     = {2003},
  volume   = {19},
  pages    = {ii179-ii188},
  number   = {Suppl. 2},
  abstract = {Motivation: {F}eature subset selection is an important preprocessing step for classification. {I}n biology, where structures or processes are described by a large number of features, the elimination of irrelevant and redundant information in a reasonable amount of time has a number of advantages. {I}t enables the classification system to achieve good or even better solutions with a restricted subset of features, allows for a faster classification, and it helps the human expert focus on a relevant subset of features, hence providing useful biological knowledge. {R}esults: {W}e present a heuristic method based on {E}stimation of {D}istribution {A}lgorithms to select relevant subsets of features for splice site prediction in {A}rabidopsis thaliana. {W}e show that this method performs a fast detection of relevant feature subsets using the technique of constrained feature subsets. {C}ompared to the traditional greedy methods the gain in speed can be up to one order of magnitude, with results being comparable or even better than the greedy methods. {T}his makes it a very practical solution for classification tasks that can be solved using a relatively small amount of discriminative features (or feature dependencies), but where the initial set of potential discriminative features is rather large. {K}eywords: {M}achine {L}earning, {F}eature {S}ubset {S}election, {E}stimation of {D}istribution {A}lgorithms, {S}plice {S}ite {P}rediction. {C}ontact: yvsae@gengenp.rug.ac.be},
  pdf      = {../local/Saeys2003Fast.pdf},
  file     = {Saeys2003Fast.pdf:local/Saeys2003Fast.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/suppl_2/ii179}
}
@article{Saigo2004Protein,
  author   = {Saigo, H. and Vert, J.-P. and Ueda, N. and Akutsu, T.},
  title    = {Protein homology detection using string alignment kernels},
  journal  = {Bioinformatics},
  year     = {2004},
  volume   = {20},
  pages    = {1682-1689},
  number   = {11},
  abstract = {Motivation: {R}emote homology detection between protein sequences is a central problem in computational biology. {D}iscriminative methods involving support vector machines ({SVM}s) are currently the most effective methods for the problem of superfamily recognition in the {S}tructural {C}lassification {O}f {P}roteins ({SCOP}) database. {T}he performance of {SVM}s depends critically on the kernel function used to quantify the similarity between sequences. {R}esults: {W}e propose new kernels for strings adapted to biological sequences, which we call local alignment kernels. {T}hese kernels measure the similarity between two sequences by summing up scores obtained from local alignments with gaps of the sequences. {W}hen tested in combination with {SVM} on their ability to recognize {SCOP} superfamilies on a benchmark dataset, the new kernels outperform state-of-the-art methods for remote homology detection. {A}vailability: {S}oftware and data available upon request.},
  pdf      = {../local/Saigo2004Protein.pdf},
  file     = {Saigo2004Protein.pdf:local/Saigo2004Protein.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/20/11/1682}
}
@article{Sanchez-Carbayo2003Gene,
  author   = {Marta Sanchez-Carbayo and Nicholas D Socci and Juan Jose Lozano and Wentian Li and Elizabeth Charytonowicz and Thomas J Belbin and Michael B Prystowsky and Angel R Ortiz and Geoffrey Childs and Carlos Cordon-Cardo},
  title    = {Gene discovery in bladder cancer progression using c{DNA} microarrays.},
  journal  = {Am. {J}. {P}athol.},
  year     = {2003},
  volume   = {163},
  pages    = {505-16},
  number   = {2},
  month    = {Aug},
  abstract = {To identify gene expression changes along progression of bladder cancer, we compared the expression profiles of early-stage and advanced bladder tumors using c{DNA} microarrays containing 17,842 known genes and expressed sequence tags. {T}he application of bootstrapping techniques to hierarchical clustering segregated early-stage and invasive transitional carcinomas into two main clusters. {M}ultidimensional analysis confirmed these clusters and more importantly, it separated carcinoma in situ from papillary superficial lesions and subgroups within early-stage and invasive tumors displaying different overall survival. {A}dditionally, it recognized early-stage tumors showing gene profiles similar to invasive disease. {D}ifferent techniques including standard t-test, single-gene logistic regression, and support vector machine algorithms were applied to identify relevant genes involved in bladder cancer progression. {C}ytokeratin 20, neuropilin-2, p21, and p33{ING}1 were selected among the top ranked molecular targets differentially expressed and validated by immunohistochemistry using tissue microarrays (n = 173). {T}heir expression patterns were significantly associated with pathological stage, tumor grade, and altered retinoblastoma ({RB}) expression. {M}oreover, p33{ING}1 expression levels were significantly associated with overall survival. {A}nalysis of the annotation of the most significant genes revealed the relevance of critical genes and pathways during bladder cancer progression, including the overexpression of oncogenic genes such as {DEK} in superficial tumors or immune response genes such as {C}d86 antigen in invasive disease. {G}ene profiling successfully classified bladder tumors based on their progression and clinical outcome. {T}he present study has identified molecular biomarkers of potential clinical significance and critical molecular targets associated with bladder cancer progression.},
  pdf      = {../local/Sanchez-Carbayo2003Gene.pdf},
  file     = {Sanchez-Carbayo2003Gene.pdf:local/Sanchez-Carbayo2003Gene.pdf:PDF},
  keywords = {biosvm},
  url      = {http://ajp.amjpathol.org/cgi/content/abstract/163/2/505}
}
@article{Sarda2005pSLIP,
  author   = {Deepak Sarda and Gek Huey Chua and Kuo-Bin Li and Arun Krishnan},
  title    = {p{SLIP}: {SVM} based protein subcellular localization prediction using multiple physicochemical properties.},
  journal  = {B{MC} {B}ioinformatics},
  year     = {2005},
  volume   = {6},
  pages    = {152},
  number   = {1},
  month    = {Jun},
  abstract = {B{ACKGROUND}: {P}rotein subcellular localization is an important determinant of protein function and hence, reliable methods for prediction of localization are needed. {A} number of prediction algorithms have been developed based on amino acid compositions or on the {N}-terminal characteristics (signal peptides) of proteins. {H}owever, such approaches lead to a loss of contextual information. {M}oreover, where information about the physicochemical properties of amino acids has been used, the methods employed to exploit that information are less than optimal and could use the information more effectively. {RESULTS}: {I}n this paper, we propose a new algorithm called p{SLIP} which uses {S}upport {V}ector {M}achines ({SVM}s) in conjunction with multiple physicochemical properties of amino acids to predict protein subcellular localization in eukaryotes across six different locations, namely, chloroplast, cytoplasmic, extracellular, mitochondrial, nuclear and plasma membrane. {T}he algorithm was applied to the dataset provided by {P}ark and {K}anehisa and we obtained prediction accuracies for the different classes ranging from 87.7\%-97.0\% with an overall accuracy of 93.1\%. {CONCLUSIONS}: {T}his study presents a physicochemical property based protein localization prediction algorithm. {U}nlike other algorithms, contextual information is preserved by dividing the protein sequences into clusters. {T}he prediction accuracy shows an improvement over other algorithms based on various types of amino acid composition (single, pair and gapped pair). {W}e have also implemented a web server to predict protein localization across the six classes (available at http://pslip.bii.a-star.edu.sg).},
  doi      = {10.1186/1471-2105-6-152},
  pdf      = {../local/Sarda2005pSLIP.pdf},
  file     = {Sarda2005pSLIP.pdf:local/Sarda2005pSLIP.pdf:PDF},
  keywords = {biosvm},
  pii      = {1471-2105-6-152},
  url      = {http://dx.doi.org/10.1186/1471-2105-6-152}
}
@article{Schwender2004pilot,
  author   = {Holger Schwender and Manuela Zucknick and Katja Ickstadt and Hermann M Bolt and {GENICA network}},
  title    = {A pilot study on the application of statistical classification procedures to molecular epidemiological data.},
  journal  = {Toxicol {L}ett},
  year     = {2004},
  volume   = {151},
  pages    = {291-9},
  number   = {1},
  month    = {Jun},
  abstract = {The development of new statistical methods for use in molecular epidemiology comprises the building and application of appropriate classification rules. {T}he aim of this study was to assess various classification methods that can potentially handle genetic interactions. {A} data set comprising genotypes at 25 single nucleotide polymorphic ({SNP}) loci from 518 breast cancer cases and 586 age-matched population-based controls from the {GENICA} study was used to built a classification rule with the discrimination methods {SVM} (support vector machine), {CART} (classification and regression tree), {B}agging, {R}andom {F}orest, {L}ogit{B}oost and k nearest neighbours (k{NN}). {A} blind pilot analysis of the genotypic data set was a first approach to obtain an impression of the statistical structure of the data. {F}urthermore, this analysis was performed to explore classification methods that may be applied to molecular-epidemiological evaluation. {T}he results showed that all blindly applied classification methods had a slightly smaller misclassification rate than a random classification. {T}he findings, nevertheless, suggest that {SNP} data might be useful for the classification of individuals into categories of high or low risk of diseases.},
  keywords = {biosvm}
}
@book{Schoelkopf2004Kernel,
  title     = {Kernel {M}ethods in {C}omputational {B}iology},
  publisher = {MIT Press},
  year      = {2004},
  author    = {Sch{\"o}lkopf, B. and Tsuda, K. and Vert, J.-P.},
  address   = {Cambridge, Massachusetts},
  keywords  = {biosvm},
  owner     = {vert}
}
@inproceedings{Seeger2002Covariance,
  author    = {Seeger, M.},
  title     = {Covariance {K}ernels from {B}ayesian {G}enerative {M}odels},
  booktitle = {Adv. {N}eural {I}nform. {P}rocess. {S}yst.},
  year      = {2002},
  volume    = {14},
  pages     = {905-912},
  pdf       = {../local/nips2001.pdf},
  file      = {nips2001.pdf:http\://www.cs.berkeley.edu/~mseeger/papers/nips2001.pdf:PDF},
  keywords  = {biosvm},
  owner     = {vert}
}
@article{Segal2003Regression,
  author   = {Segal, M. R. and Dahlquist, K. D. and Conklin, B. R.},
  title    = {Regression approaches for microarray data analysis.},
  journal  = {J. {C}omput. {B}iol.},
  year     = {2003},
  volume   = {10},
  pages    = {961-980},
  number   = {6},
  abstract = {A variety of new procedures have been devised to handle the two-sample comparison (e.g., tumor versus normal tissue) of gene expression values as measured with microarrays. {S}uch new methods are required in part because of some defining characteristics of microarray-based studies: (i) the very large number of genes contributing expression measures which far exceeds the number of samples (observations) available and (ii) the fact that by virtue of pathway/network relationships, the gene expression measures tend to be highly correlated. {T}hese concerns are exacerbated in the regression setting, where the objective is to relate gene expression, simultaneously for multiple genes, to some external outcome or phenotype. {C}orrespondingly, several methods have been recently proposed for addressing these issues. {W}e briefly critique some of these methods prior to a detailed evaluation of gene harvesting. {T}his reveals that gene harvesting, without additional constraints, can yield artifactual solutions. {R}esults obtained employing such constraints motivate the use of regularized regression procedures such as the lasso, least angle regression, and support vector machines. {M}odel selection and solution multiplicity issues are also discussed. {T}he methods are evaluated using a microarray-based study of cardiomyopathy in transgenic mice.},
  doi      = {10.1089/106652703322756177},
  pdf      = {../local/Segal2003Regression.pdf},
  file     = {Segal2003Regression.pdf:local/Segal2003Regression.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert}
}
@article{Segal2003Classificationa,
  author   = {Segal, N. H. and Pavlidis, P. and Antonescu, C. R. and Maki, R. G. and Noble, W. S. and DeSantis, D. and Woodruff, J. M. and Lewis, J. J. and Brennan, M. F. and Houghton, A. N. and Cordon-Cardo, C.},
  title    = {Classification and {S}ubtype {P}rediction of {A}dult {S}oft {T}issue {S}arcoma by {F}unctional {G}enomics},
  journal  = {Am. {J}. {P}athol.},
  year     = {2003},
  volume   = {163},
  pages    = {691-700},
  number   = {2},
  month    = {Aug},
  abstract = {Adult soft tissue sarcomas are a heterogeneous group of tumors, including well-described subtypes by histological and genotypic criteria, and pleomorphic tumors typically characterized by non-recurrent genetic aberrations and karyotypic heterogeneity. {T}he latter pose a diagnostic challenge, even to experienced pathologists. {W}e proposed that gene expression profiling in soft tissue sarcoma would identify a genomic-based classification scheme that is useful in diagnosis. {RNA} samples from 51 pathologically confirmed cases, representing nine different histological subtypes of adult soft tissue sarcoma, were examined using the {A}ffymetrix {U}95{A} {G}ene{C}hip. {S}tatistical tests were performed on experimental groups identified by cluster analysis, to find discriminating genes that could subsequently be applied in a support vector machine algorithm. {S}ynovial sarcomas, round-cell/myxoid liposarcomas, clear-cell sarcomas and gastrointestinal stromal tumors displayed remarkably distinct and homogenous gene expression profiles. {P}leomorphic tumors were heterogeneous. {N}otably, a subset of malignant fibrous histiocytomas, a controversial histological subtype, was identified as a distinct genomic group. {T}he support vector machine algorithm supported a genomic basis for diagnosis, with both high sensitivity and specificity. {I}n conclusion, we showed gene expression profiling to be useful in classification and diagnosis, providing insights into pathogenesis and pointing to potential new therapeutic targets of soft tissue sarcoma.},
  pdf      = {../local/Segal2003Classificationa.pdf},
  file     = {Segal2003Classificationa.pdf:local/Segal2003Classificationa.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://ajp.amjpathol.org/cgi/content/abstract/163/2/691}
}
@article{Segal2003Classification,
  author   = {Segal, N. H. and Pavlidis, P. and Noble, W. S. and Antonescu, C. R. and Viale, A. and Wesley, U. V. and Busam, K. and Gallardo, H. and DeSantis, D. and Brennan, M. F. and Cordon-Cardo, C. and Wolchok, J. D. and Houghton, A. N.},
  title    = {Classification of {C}lear-{C}ell {S}arcoma as a {S}ubtype of {M}elanoma by {G}enomic {P}rofiling},
  journal  = {J. {C}lin. {O}ncol.},
  year     = {2003},
  volume   = {21},
  pages    = {1775-1781},
  number   = {9},
  month    = {May},
  abstract = {Purpose: {T}o develop a genome-based classification scheme for clear-cell sarcoma ({CCS}), also known as melanoma of soft parts ({MSP}), which would have implications for diagnosis and treatment. {T}his tumor displays characteristic features of soft tissue sarcoma ({STS}), including deep soft tissue primary location and a characteristic translocation, t(12;22)(q13;q12), involving {EWS} and {ATF}1 genes. {CCS}/{MSP} also has typical melanoma features, including immunoreactivity for {S}100 and {HMB}45, pigmentation, {MITF}-{M} expression, and a propensity for regional lymph node metastases. {M}aterials and {M}ethods: {RNA} samples from 21 cell lines and 60 pathologically confirmed cases of {STS}, melanoma, and {CCS}/{MSP} were examined using the {U}95{A} {G}ene{C}hip ({A}ffymetrix, {S}anta {C}lara, {CA}). {H}ierarchical cluster analysis, principal component analysis, and support vector machine ({SVM}) analysis exploited genomic correlations within the data to classify {CCS}/{MSP}. {R}esults: {U}nsupervised analyses demonstrated a clear distinction between {STS} and melanoma and, furthermore, showed that {CCS}/{MSP} cluster with the melanomas as a distinct group. {A} supervised {SVM} learning approach further validated this finding and provided a user-independent approach to diagnosis. {G}enes of interest that discriminate {CCS}/{MSP} included those encoding melanocyte differentiation antigens, {MITF}, {SOX}10, {ERBB}3, and {FGFR}1. {C}onclusion: {G}ene expression profiles support the classification of {CCS}/{MSP} as a distinct genomic subtype of melanoma. {A}nalysis of these gene profiles using the {SVM} may be an important diagnostic tool. {G}enomic analysis identified potential targets for the development of therapeutic strategies in the treatment of this disease.},
  doi      = {10.1200/JCO.2003.10.108},
  pdf      = {../local/Segal2003Classification.pdf},
  file     = {Segal2003Classification.pdf:local/Segal2003Classification.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://dx.doi.org/10.1200/JCO.2003.10.108}
}
@article{Seike2005Proteomic,
  author   = {Seike, M. and Kondo, T. and Fujii, K. and Okano, T. and Yamada, T. and Matsuno, Y. and Gemma, A. and Kudoh, S. and Hirohashi, S.},
  title    = {Proteomic signatures for histological types of lung cancer.},
  journal  = {Proteomics},
  year     = {2005},
  month    = {Jul},
  abstract = {We performed proteomic studies on lung cancer cells to elucidate the mechanisms that determine histological phenotype. {T}hirty lung cancer cell lines with three different histological backgrounds (squamous cell carcinoma, small cell lung carcinoma and adenocarcinoma) were subjected to two-dimensional difference gel electrophoresis (2-{D} {DIGE}) and grouped by multivariate analyses on the basis of their protein expression profiles. 2-{D} {DIGE} achieves more accurate quantification of protein expression by using highly sensitive fluorescence dyes to label the cysteine residues of proteins prior to two-dimensional polyacrylamide gel electrophoresis. {W}e found that hierarchical clustering analysis and principal component analysis divided the cell lines according to their original histology. {S}pot ranking analysis using a support vector machine algorithm and unsupervised classification methods identified 32 protein spots essential for the classification. {T}he proteins corresponding to the spots were identified by mass spectrometry. {N}ext, lung cancer cells isolated from tumor tissue by laser microdissection were classified on the basis of the expression pattern of these 32 protein spots. {B}ased on the expression profile of the 32 spots, the isolated cancer cells were categorized into three histological groups: the squamous cell carcinoma group, the adenocarcinoma group, and a group of carcinomas with other histological types. {I}n conclusion, our results demonstrate the utility of quantitative proteomic analysis for molecular diagnosis and classification of lung cancer cells.},
  doi      = {10.1002/pmic.200401166},
  pdf      = {../local/Seike2005Proteomic.pdf},
  file     = {Seike2005Proteomic.pdf:local/Seike2005Proteomic.pdf:PDF},
  keywords = {biosvm proteomics},
  url      = {http://dx.doi.org/10.1002/pmic.200401166}
}
@article{Sen2004Predicting,
  author   = {Sen, T. Z. and Kloczkowski, A. and Jernigan, R. L. and Yan, C. and Honavar, V. and Ho, K. M. and Wang, C. Z. and Ihm, Y. and Cao, H. and Gu, X. and Dobbs, D.},
  title    = {Predicting binding sites of hydrolase-inhibitor complexes by combining several methods.},
  journal  = {B{MC} {B}ioinformatics},
  year     = {2004},
  volume   = {5},
  pages    = {205},
  abstract = {Background {P}rotein-protein interactions play a critical role in protein function. {C}ompletion of many genomes is being followed rapidly by major efforts to identify interacting protein pairs experimentally in order to decipher the networks of interacting, coordinated-in-action proteins. {I}dentification of protein-protein interaction sites and detection of specific amino acids that contribute to the specificity and the strength of protein interactions is an important problem with broad applications ranging from rational drug design to the analysis of metabolic and signal transduction networks. {R}esults {I}n order to increase the power of predictive methods for protein-protein interaction sites, we have developed a consensus methodology for combining four different methods. {T}hese approaches include: data mining using {S}upport {V}ector {M}achines, threading through protein structures, prediction of conserved residues on the protein surface by analysis of phylogenetic trees, and the {C}onservatism of {C}onservatism method of {M}irny and {S}hakhnovich. {R}esults obtained on a dataset of hydrolase-inhibitor complexes demonstrate that the combination of all four methods yield improved predictions over the individual methods. {C}onclusions {W}e developed a consensus method for predicting protein-protein interface residues by combining sequence and structure-based methods. {T}he success of our consensus approach suggests that similar methodologies can be developed to improve prediction accuracies for other bioinformatic problems.},
  doi      = {10.1186/1471-2105-5-205},
  pdf      = {../local/Sen2004Predicting.pdf},
  file     = {Sen2004Predicting.pdf:local/Sen2004Predicting.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert}
}
@article{Senawongse2005Predicting,
  author   = {Pasak Senawongse and Andrew R Dalby and Zheng Rong Yang},
  title    = {Predicting the phosphorylation sites using hidden markov models and machine learning methods.},
  journal  = {J {C}hem {I}nf {M}odel},
  year     = {2005},
  volume   = {45},
  pages    = {1147-52},
  number   = {4},
  abstract = {Accurately predicting phosphorylation sites in proteins is an important issue in postgenomics, for which how to efficiently extract the most predictive features from amino acid sequences for modeling is still challenging. {A}lthough both the distributed encoding method and the bio-basis function method work well, they still have some limits in use. {T}he distributed encoding method is unable to code the biological content in sequences efficiently, whereas the bio-basis function method is a nonparametric method, which is often computationally expensive. {A}s hidden {M}arkov models ({HMM}s) can be used to generate one model for one cluster of aligned protein sequences, the aim in this study is to use {HMM}s to extract features from amino acid sequences, where sequence clusters are determined using available biological knowledge. {I}n this novel method, {HMM}s are first constructed using functional sequences only. {B}oth functional and nonfunctional training sequences are then inputted into the trained {HMM}s to generate functional and nonfunctional feature vectors. {F}rom this, a machine learning algorithm is used to construct a classifier based on these feature vectors. {I}t is found in this work that (1) this method provides much better prediction accuracy than the use of {HMM}s only for prediction, and (2) the support vector machines ({SVM}s) algorithm outperforms decision trees and neural network algorithms when they are constructed on the features extracted using the trained {HMM}s.},
  doi      = {10.1021/ci050047+},
  pdf      = {../local/Senawongse2005Predicting.pdf},
  file     = {Senawongse2005Predicting.pdf:local/Senawongse2005Predicting.pdf:PDF},
  keywords = {biosvm},
  url      = {http://dx.doi.org/10.1021/ci050047+}
}
@article{Serra2003Development,
  author   = {Serra, J. R. and Thompson, E. D. and Jurs, P. C.},
  title    = {Development of binary classification of structural chromosome aberrations for a diverse set of organic compounds from molecular structure},
  journal  = {Chem. {R}es. {T}oxicol.},
  year     = {2003},
  volume   = {16},
  pages    = {153-163},
  number   = {2},
  abstract = {Classification models are generated to predict in vitro cytogenetic results for a diverse set of 383 organic compounds. {B}oth k-nearest neighbor and support vector machine models are developed. {T}hey are based on calculated molecular structure descriptors. {E}ndpoints used are the labels clastogenic or nonclastogenic according to an in vitro chromosomal aberration assay with {C}hinese hamster lung cells. {C}ompounds that were tested with both a 24 and 48 h exposure are included. {E}ach compound is represented by calculated molecular structure descriptors encoding the topological, electronic, geometrical, or polar surface area aspects of the structure. {S}ubsets of informative descriptors are identified with genetic algorithm feature selection coupled to the appropriate classification algorithm. {T}he overall classification success rate for a k-nearest neighbor classifier built with just six topological descriptors is 81.2\% for the training set and 86.5\% for an external prediction set. {T}he overall classification success rate for a three-descriptor support vector machine model is 99.7\% for the training set, 92.1\% for the cross-validation set, and 83.8\% for an external prediction set.},
  doi      = {10.1021/tx020077w},
  pdf      = {../local/Serra2003Development.pdf},
  file     = {Serra2003Development.pdf:local/Serra2003Development.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://dx.doi.org/10.1021/tx020077w}
}
@article{Sharan2005motif-based,
  author   = {Sharan, R. and Myers, E. W.},
  title    = {A motif-based framework for recognizing sequence families.},
  journal  = {Bioinformatics},
  year     = {2005},
  volume   = {21},
  number   = {Suppl. 1},
  pages    = {i387-i393},
  month    = {Jun},
  abstract = {M{OTIVATION}: {M}any signals in biological sequences are based on the presence or absence of base signals and their spatial combinations. {O}ne of the best known examples of this is the signal identifying a core promoter---the site at which the basal transcription machinery starts the transcription of a gene. {O}ur goal is a fully automatic pattern recognition system for a family of sequences, which simultaneously discovers the base signals, their spatial relationships and a classifier based upon them. {RESULTS}: {I}n this paper we present a general method for characterizing a set of sequences by their recurrent motifs. {O}ur approach relies on novel probabilistic models for {DNA} binding sites and modules of binding sites, on algorithms to study them from the data and on a support vector machine that uses the models studied to classify a set of sequences. {W}e demonstrate the applicability of our approach to diverse instances, ranging from families of promoter sequences to a dataset of intronic sequences flanking alternatively spliced exons. {O}n a core promoter dataset our results are comparable with the state-of-the-art {M}c{P}romoter. {O}n a dataset of alternatively spliced exons we outperform a previous approach. {W}e also achieve high success rates in recognizing cell cycle regulated genes. {T}hese results demonstrate that a fully automatic pattern recognition algorithm can meet or exceed the performance of hand-crafted approaches. {AVAILABILITY}: {T}he software and datasets are available from the authors upon request. {CONTACT}: roded@tau.ac.il.},
  doi      = {10.1093/bioinformatics/bti1002},
  pdf      = {../local/Sharan2005motif-based.pdf},
  file     = {Sharan2005motif-based.pdf:local/Sharan2005motif-based.pdf:PDF},
  keywords = {biosvm},
  pii      = {21/suppl_1/i387},
  url      = {http://dx.doi.org/10.1093/bioinformatics/bti1002}
}
@inproceedings{She2003Frequent-subsequence-based,
  author    = {She, R. and Chen, F. and Wang, K. and Ester, M. and Gardy, J. L. and Brinkman, F. S. L.},
  title     = {Frequent-subsequence-based prediction of outer membrane proteins},
  booktitle = {K{DD} '03: {P}roceedings of the ninth {ACM} {SIGKDD} international conference on {K}nowledge discovery and data mining},
  year      = {2003},
  pages     = {436-445},
  publisher = {ACM Press},
  abstract  = {A number of medically important disease-causing bacteria (collectively called {G}ram-negative bacteria) are noted for the extra "outer" membrane that surrounds their cell. {P}roteins resident in this membrane (outer membrane proteins, or {OMP}s) are of primary research interest for antibiotic and vaccine drug design as they are on the surface of the bacteria and so are the most accessible targets to develop new drugs against. {W}ith the development of genome sequencing technology and bioinformatics, biologists can now deduce all the proteins that are likely produced in a given bacteria and have attempted to classify where proteins are located in a bacterial cell. {H}owever such protein localization programs are currently least accurate when predicting {OMP}s, and so there is a current need for the development of a better {OMP} classifier. {D}ata mining research suggests that the use of frequent patterns has good performance in aiding the development of accurate and efficient classification algorithms. {I}n this paper, we present two methods to identify {OMP}s based on frequent subsequences and test them on all {G}ram-negative bacterial proteins whose localizations have been determined by biological experiments. {O}ne classifier follows an association rule approach, while the other is based on support vector machines ({SVM}s). {W}e compare the proposed methods with the state-of-the-art methods in the biological domain. {T}he results demonstrate that our methods are better both in terms of accurately identifying {OMP}s and providing biological insights that increase our understanding of the structures and functions of these important proteins.},
  doi       = {10.1145/956750.956800},
  pdf       = {../local/She2003Frequent-subsequence-based.pdf},
  file      = {She2003Frequent-subsequence-based.pdf:local/She2003Frequent-subsequence-based.pdf:PDF},
  keywords  = {biosvm},
  owner     = {jeanphilippevert}
}
@article{Shi2005Building,
  author   = {Lei Shi and Fabien Campagne},
  title    = {Building a protein name dictionary from full text: a machine learning term extraction approach.},
  journal  = {B{MC} {B}ioinformatics},
  year     = {2005},
  volume   = {6},
  pages    = {88},
  number   = {1},
  month    = {Apr},
  abstract = {B{ACKGROUND}: {T}he majority of information in the biological literature resides in full text articles, instead of abstracts. {Y}et, abstracts remain the focus of many publicly available literature data mining tools. {M}ost literature mining tools rely on pre-existing lexicons of biological names, often extracted from curated gene or protein databases. {T}his is a limitation, because such databases have low coverage of the many name variants which are used to refer to biological entities in the literature. {RESULTS}: {W}e present an approach to recognize named entities in full text. {T}he approach collects high frequency terms in an article, and uses support vector machines ({SVM}) to identify biological entity names. {I}t is also computationally efficient and robust to noise commonly found in full text material. {W}e use the method to create a protein name dictionary from a set of 80,528 full text articles. {O}nly 8.3\% of the names in this dictionary match {S}wiss{P}rot description lines. {W}e assess the quality of the dictionary by studying its protein name recognition performance in full text. {CONCLUSION}: {T}his dictionary term lookup method compares favourably to other published methods, supporting the significance of our direct extraction approach. {T}he method is strong in recognizing name variants not found in {S}wiss{P}rot.},
  doi      = {10.1186/1471-2105-6-88},
  pdf      = {../local/Shi2005Building.pdf},
  file     = {Shi2005Building.pdf:local/Shi2005Building.pdf:PDF},
  keywords = {biosvm},
  pii      = {1471-2105-6-88},
  url      = {http://dx.doi.org/10.1186/1471-2105-6-88}
}
@article{Shipp2002Diffuse,
  author = {Shipp, M. A. and Ross, K. N. and Tamayo, P. and Weng, A. P. and Kutok, J. L. and Aguiar, R. C. T. and Gaasenbeek, M. and Angelo, M. and Reich, M. and Pinkus, G. A. and Ray, T. S. and Koval, M. A. and Last, K. W. and Norton, A. and Lister, T. A. and Mesirov, J. and Neuberg, D. S. and Lander, E. S. and Aster, J. C. and Golub, T. R.},
  title = {Diffuse large {B}-cell lymphoma outcome prediction by gene-expression profiling and supervised machine learning},
  journal = {Nat. {M}ed.},
  year = {2002},
  volume = {8},
  pages = {68--74},
  number = {1},
  abstract = {Diffuse large {B}-cell lymphoma ({DLBCL}), the most common lymphoid malignancy in adults, is curable in less than 50\% of patients. {P}rognostic models based on pre-treatment characteristics, such as the {I}nternational {P}rognostic {I}ndex ({IPI}), are currently used to predict outcome in {DLBCL}. {H}owever, clinical outcome models identify neither the molecular basis of clinical heterogeneity, nor specific therapeutic targets. {W}e analyzed the expression of 6,817 genes in diagnostic tumor specimens from {DLBCL} patients who received cyclophosphamide, adriamycin, vincristine and prednisone ({CHOP})-based chemotherapy, and applied a supervised learning prediction method to identify cured versus fatal or refractory disease. {T}he algorithm classified two categories of patients with very different five-year overall survival rates (70\% versus 12\%). {T}he model also effectively delineated patients within specific {IPI} risk categories who were likely to be cured or to die of their disease. {G}enes implicated in {DLBCL} outcome included some that regulate responses to {B}-cell-receptor signaling, critical serine/threonine phosphorylation pathways and apoptosis.
{O}ur data indicate that supervised learning classification techniques can predict outcome in {DLBCL} and identify rational targets for intervention.},
  doi = {10.1038/nm0102-68},
  pdf = {../local/Shipp2002Diffuse.pdf},
  file = {Shipp2002Diffuse.pdf:local/Shipp2002Diffuse.pdf:PDF},
  keywords = {biosvm},
  owner = {vert}
}
@article{Siepen2003Beta,
  author = {Siepen, J. A. and Radford, S. E. and Westhead, D. R.},
  title = {Beta {E}dge strands in protein structure prediction and aggregation},
  journal = {Protein {S}ci.},
  year = {2003},
  volume = {12},
  pages = {2348--2359},
  number = {10},
  abstract = {It is well established that recognition between exposed edges of $\beta$-sheets is an important mode of protein-protein interaction and can have pathological consequences; for instance, it has been linked to the aggregation of proteins into a fibrillar structure, which is associated with a number of predominantly neurodegenerative disorders. {A} number of protective mechanisms have evolved in the edge strands of $\beta$-sheets, preventing the aggregation and insolubility of most natural $\beta$-sheet proteins. {S}uch mechanisms are unfavorable in the interior of a $\beta$-sheet. {T}he problem of distinguishing edge strands from central strands based on sequence information alone is important in predicting residues and mutations likely to be involved in aggregation, and is also a first step in predicting folding topology. {H}ere we report support vector machine ({SVM}) and decision tree methods developed to classify edge strands from central strands in a representative set of protein domains. {I}nterestingly, rules generated by the decision tree method are in close agreement with our knowledge of protein structure and are potentially useful in a number of different biological applications. {W}hen trained on strands from proteins of known structure, using structure-based ({D}ictionary of {S}econdary {S}tructure in {P}roteins) strand assignments, both methods achieved mean cross-validated, prediction accuracies of $\sim$78\%. {T}hese accuracies were reduced when strand assignments from secondary structure prediction were used.
{F}urther investigation of this effect revealed that it could be explained by a significant reduction in the accuracy of standard secondary structure prediction methods for edge strands, in comparison with central strands.},
  doi = {10.1110/ps.03234503},
  pdf = {../local/Siepen2003beta.pdf},
  file = {Siepen2003beta.pdf:local/Siepen2003beta.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://www.proteinscience.org/cgi/content/abstract/12/10/2348}
}
@inproceedings{Sonnenburg2002New,
  author = {Sonnenburg, S. and R{\"a}tsch, G. and Jagota, A. and M{\"u}ller, K.-R.},
  title = {New methods for splice-site recognition},
  booktitle = {Proc. {I}nternational conference on artificial {N}eural {N}etworks -- {ICANN}'02},
  year = {2002},
  editor = {Dorronsoro, J. R.},
  volume = {2415},
  series = {LNCS},
  pages = {329--336},
  publisher = {Springer Berlin},
  pdf = {../local/Sonnenburg2002New.pdf},
  file = {Sonnenburg2002New.pdf:local/Sonnenburg2002New.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert}
}
@article{Sorich2004Rapid,
  author = {Michael J Sorich and Ross A McKinnon and John O Miners and David A Winkler and Paul A Smith},
  title = {Rapid prediction of chemical metabolism by human {UDP}-glucuronosyltransferase isoforms using quantum chemical descriptors derived with the electronegativity equalization method.},
  journal = {J {M}ed {C}hem},
  year = {2004},
  volume = {47},
  pages = {5311--5317},
  number = {21},
  month = oct,
  abstract = {This study aimed to evaluate in silico models based on quantum chemical ({QC}) descriptors derived using the electronegativity equalization method ({EEM}) and to assess the use of {QC} properties to predict chemical metabolism by human {UDP}-glucuronosyltransferase ({UGT}) isoforms. {V}arious {EEM}-derived {QC} molecular descriptors were calculated for known {UGT} substrates and nonsubstrates. {C}lassification models were developed using support vector machine and partial least squares discriminant analysis. {I}n general, the most predictive models were generated with the support vector machine. {C}ombining {QC} and 2{D} descriptors (from previous work) using a consensus approach resulted in a statistically significant improvement in predictivity (to 84\%) over both the {QC} and 2{D} models and the other methods of combining the descriptors. {EEM}-derived {QC} descriptors were shown to be both highly predictive and computationally efficient. {I}t is likely that {EEM}-derived {QC} properties will be generally useful for predicting {ADMET} and physicochemical properties during drug discovery.},
  doi = {10.1021/jm0495529},
  pdf = {../local/Sorich2004Rapid.pdf},
  file = {Sorich2004Rapid.pdf:local/Sorich2004Rapid.pdf:PDF},
  keywords = {biosvm},
  url = {http://dx.doi.org/10.1021/jm0495529}
}
@article{Sorich2003Comparison,
  author = {M. J. Sorich and J. O. Miners and R. A. McKinnon and D. A. Winkler and F. R. Burden and P. A. Smith},
  title = {Comparison of linear and nonlinear classification algorithms for the prediction of drug and chemical metabolism by human {UDP}-glucuronosyltransferase isoforms.},
  journal = {J {C}hem {I}nf {C}omput {S}ci},
  year = {2003},
  volume = {43},
  pages = {2019--2024},
  number = {6},
  abstract = {Partial least squares discriminant analysis ({PLSDA}), {B}ayesian regularized artificial neural network ({BRANN}), and support vector machine ({SVM}) methodologies were compared by their ability to classify substrates and nonsubstrates of 12 isoforms of human {UDP}-glucuronosyltransferase ({UGT}), an enzyme ``superfamily'' involved in the metabolism of drugs, nondrug xenobiotics, and endogenous compounds. {S}imple two-dimensional descriptors were used to capture chemical information. {F}or each data set, 70\% of the data were used for training, and the remainder were used to assess the generalization performance. {I}n general, the {SVM} methodology was able to produce models with the best predictive performance, followed by {BRANN} and then {PLSDA}. {H}owever, a small number of data sets showed either equivalent or better predictability using {PLSDA}, which may indicate relatively linear relationships in these data sets. {A}ll {SVM} models showed predictive ability (>60\% of test set predicted correctly) and five out of the 12 test sets showed excellent prediction (>80\% prediction accuracy).
{T}hese models represent the first use of pattern recognition methods to discriminate between substrates and nonsubstrates of human drug metabolizing enzymes and the first thorough assessment of three classification algorithms using multiple metabolic data sets.},
  doi = {10.1021/ci034108k},
  pdf = {../local/Sorich2003Comparison.pdf},
  file = {Sorich2003Comparison.pdf:local/Sorich2003Comparison.pdf:PDF},
  keywords = {biosvm},
  url = {http://dx.doi.org/10.1021/ci034108k}
}
@inproceedings{Stapley2002Predicting,
  author = {Stapley, B. J. and Kelley, L. A. and Sternberg, M. J.},
  title = {Predicting the sub-cellular location of proteins from text using support vector machines.},
  booktitle = {Proceedings of the {P}acific {S}ymposium on {B}iocomputing 2002},
  year = {2002},
  editor = {Russ B. Altman and A. Keith Dunker and Lawrence Hunter and Kevin Lauderdale and Teri E. Klein},
  pages = {374--385},
  publisher = {World Scientific},
  abstract = {We present an automatic method to classify the sub-cellular location of proteins based on the text of relevant medline abstracts. {F}or each protein, a vector of terms is generated from medline abstracts in which the protein/gene's name or synonym occurs. {A} {S}upport {V}ector {M}achine ({SVM}) is used to automatically partition the term space and to thus discriminate the textual features that define sub-cellular location. {T}he method is benchmarked on a set of proteins of known sub-cellular location from {S}. cerevisiae. {N}o prior knowledge of the problem domain nor any natural language processing is used at any stage. {T}he method out-performs support vector machines trained on amino acid composition and has comparable performance to rule-based text classifiers. {C}ombining text with protein amino-acid composition improves recall for some sub-cellular locations. {W}e discuss the generality of the method and its potential application to a variety of biological classification problems.},
  pdf = {../local/Stapley2002Predicting.pdf},
  file = {Stapley2002Predicting.pdf:local/Stapley2002Predicting.pdf:PDF},
  keywords = {biosvm},
  subject = {biokernel},
  url = {http://www.smi.stanford.edu/projects/helix/psb02/stapley.pdf}
}
@article{Statnikov2004Methods,
  author = {Alexander Statnikov and Constantin F Aliferis and Ioannis Tsamardinos},
  title = {Methods for multi-category cancer diagnosis from gene expression data: a comprehensive evaluation to inform decision support system development.},
  journal = {Medinfo},
  year = {2004},
  volume = {11},
  pages = {813--817},
  number = {Pt 2},
  abstract = {Cancer diagnosis is a major clinical applications area of gene expression microarray technology. {W}e are seeking to develop a system for cancer diagnostic model creation based on microarray data. {I}n order to equip the system with the optimal combination of data modeling methods, we performed a comprehensive evaluation of several major classification algorithms, gene selection methods, and cross-validation designs using 11 datasets spanning 74 diagnostic categories (41 cancer types and 12 normal tissue types). {T}he {M}ulti-{C}ategory {S}upport {V}ector {M}achine techniques by {C}rammer and {S}inger, {W}eston and {W}atkins, and one-versus-rest were found to be the best methods and they outperform other learning algorithms such as {K}-{N}earest {N}eighbors and {N}eural {N}etworks often to a remarkable degree. {G}ene selection techniques are shown to significantly improve classification performance. {T}hese results guided the development of a software system that fully automates cancer diagnostic model construction with quality on par with or better than previously published results derived by expert human analysts.},
  keywords = {biosvm},
  pii = {D040004907}
}
@article{Statnikov2005comprehensive,
  author = {Statnikov, A. and Aliferis, C. F. and Tsamardinos, I. and Hardin, D. and Levy, S.},
  title = {A comprehensive evaluation of multicategory classification methods for microarray gene expression cancer diagnosis},
  journal = {Bioinformatics},
  year = {2005},
  note = {To appear},
  abstract = {Motivation: {C}ancer diagnosis is one of the most important emerging clinical applications of gene expression microarray technology. {W}e are seeking to develop a computer system for powerful and reliable cancer diagnostic model creation based on microarray data. {T}o keep a realistic perspective on clinical applications we focus on multicategory diagnosis. {I}n order to equip the system with the optimum combination of classifier, gene selection and cross-validation methods, we performed a systematic and comprehensive evaluation of several major algorithms for multicategory classification, several gene selection methods, multiple ensemble classifier methods, and two cross validation designs using 11 datasets spanning 74 diagnostic categories and 41 cancer types and 12 normal tissue types. {R}esults: {M}ulticategory {S}upport {V}ector {M}achines ({MC}-{SVM}s) are the most effective classifiers in performing accurate cancer diagnosis from gene expression data. {T}he {MC}-{SVM} techniques by {C}rammer and {S}inger, {W}eston and {W}atkins, and one-versus-rest were found to be the best methods in this domain. {MC}-{SVM}s outperform other popular machine learning algorithms such as {K}-{N}earest {N}eighbors, {B}ackpropagation and {P}robabilistic {N}eural {N}etworks, often to a remarkable degree. {G}ene selection techniques can significantly improve classification performance of both {MC}-{SVM}s and other non-{SVM} learning algorithms. {E}nsemble classifiers do not generally improve performance of the best non-ensemble models.
{T}hese results guided the construction of a software system {GEMS} ({G}ene {E}xpression {M}odel {S}elector) that automates high-quality model construction and enforces sound optimization and performance estimation procedures. {T}his is the first such system to be informed by a rigorous comparative analysis of the available algorithms and datasets. {A}vailability: {T}he software system {GEMS} is available for download from http://www.gems-system.org for non-commercial use.},
  pdf = {../local/Statnikov2005comprehensive.pdf},
  file = {Statnikov2005comprehensive.pdf:local/Statnikov2005comprehensive.pdf:PDF},
  keywords = {biosvm microarray},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/bti033v1}
}
@article{Steiner2004Discriminating,
  author = {Guido Steiner and Laura Suter and Franziska Boess and Rodolfo Gasser and Maria Cristina de Vera and Silvio Albertini and Stefan Ruepp},
  title = {Discriminating different classes of toxicants by transcript profiling.},
  journal = {Environ. {H}ealth {P}erspect.},
  year = {2004},
  volume = {112},
  pages = {1236--1248},
  number = {12},
  month = aug,
  abstract = {Male rats were treated with various model compounds or the appropriate vehicle controls. {M}ost substances were either well-known hepatotoxicants or showed hepatotoxicity during preclinical testing. {T}he aim of the present study was to determine if biological samples from rats treated with various compounds can be classified based on gene expression profiles. {I}n addition to gene expression analysis using microarrays, a complete serum chemistry profile and liver and kidney histopathology were performed. {W}e analyzed hepatic gene expression profiles using a supervised learning method (support vector machines; {SVM}s) to generate classification rules and combined this with recursive feature elimination to improve classification performance and to identify a compact subset of probe sets with potential use as biomarkers. {T}wo different {SVM} algorithms were tested, and the models obtained were validated with a compound-based external cross-validation approach. {O}ur predictive models were able to discriminate between hepatotoxic and nonhepatotoxic compounds. {F}urthermore, they predicted the correct class of hepatotoxicant in most cases. {W}e provide an example showing that a predictive model built on transcript profiles from one rat strain can successfully classify profiles from another rat strain. {I}n addition, we demonstrate that the predictive models identify nonresponders and are able to discriminate between gene changes related to pharmacology and toxicity.
{T}his work confirms the hypothesis that compound classification based on gene expression data is feasible.},
  pdf = {../local/Steiner2004Discriminating.pdf},
  file = {Steiner2004Discriminating.pdf:local/Steiner2004Discriminating.pdf:PDF},
  keywords = {biosvm},
  url = {http://ehp.niehs.nih.gov/txg/docs/2004/7036/abstract.html}
}
@article{Su2001Molecular,
  author = {Su, A. I. and Welsh, J. B. and Sapinoso, L. M. and Kern, S. G. and Dimitrov, P. and Lapp, H. and Schultz, P. G. and Powell, S. M. and Moskaluk, C. A. and Frierson, Jr., H. F. and Hampton, G. M.},
  title = {Molecular {C}lassification of {H}uman {C}arcinomas by {U}se of {G}ene {E}xpression {S}ignatures},
  journal = {Cancer {R}es.},
  year = {2001},
  volume = {61},
  pages = {7388--7393},
  number = {20},
  abstract = {Classification of human tumors according to their primary anatomical site of origin is fundamental for the optimal treatment of patients with cancer. {H}ere we describe the use of large-scale {RNA} profiling and supervised machine learning algorithms to construct a first-generation molecular classification scheme for carcinomas of the prostate, breast, lung, ovary, colorectum, kidney, liver, pancreas, bladder/ureter, and gastroesophagus, which collectively account for $\sim$70\% of all cancer-related deaths in the {U}nited {S}tates. {T}he classification scheme was based on identifying gene subsets whose expression typifies each cancer class, and we quantified the extent to which these genes are characteristic of a specific tumor type by accurately and confidently predicting the anatomical site of tumor origin for 90\% of 175 carcinomas, including 9 of 12 metastatic lesions. {T}he predictor gene subsets include those whose expression is typical of specific types of normal epithelial differentiation, as well as other genes whose expression is elevated in cancer. {T}his study demonstrates the feasibility of predicting the tissue origin of a carcinoma in the context of multiple cancer classes.},
  pdf = {../local/Su2001Molecular.pdf.html},
  file = {Su2001Molecular.pdf.html:local/Su2001Molecular.pdf.html:PDF},
  keywords = {biosvm, breastcancer},
  owner = {jeanphilippevert},
  url = {http://cancerres.aacrjournals.org/cgi/content/abstract/61/20/7388}
}
@article{Su2003RankGene,
  author = {Su, Yang and Murali, T. M. and Pavlovic, Vladimir and Schaffer, Michael and Kasif, Simon},
  title = {{RankGene}: identification of diagnostic genes based on expression data},
  journal = {Bioinformatics},
  year = {2003},
  volume = {19},
  pages = {1578--1579},
  number = {12},
  abstract = {Summary: {R}ank{G}ene is a program for analyzing gene expression data and computing diagnostic genes based on their predictive power in distinguishing between different types of samples. {T}he program integrates into one system a variety of popular ranking criteria, ranging from the traditional t-statistic to one-dimensional support vector machines. {T}his flexibility makes {R}ank{G}ene a useful tool in gene expression analysis and feature selection. {A}vailability: http://genomics10.bu.edu/yangsu/rankgene {C}ontact: murali@bu.edu},
  pdf = {../local/Su2003RankGene.pdf},
  file = {Su2003RankGene.pdf:local/Su2003RankGene.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/12/1578}
}
@article{Sun2003Identifying,
  author = {Sun, Y. F. and Fan, X. D. and Li, Y. D.},
  title = {Identifying splicing sites in eukaryotic {RNA}: support vector machine approach.},
  journal = {Comput. {B}iol. {M}ed.},
  year = {2003},
  volume = {33},
  pages = {17--29},
  number = {1},
  abstract = {We introduce a new method for splicing sites prediction based on the theory of support vector machines ({SVM}). {T}he {SVM} represents a new approach to supervised pattern classification and has been successfully applied to a wide range of pattern recognition problems. {I}n the process of splicing sites prediction, the statistical information of {RNA} secondary structure in the vicinity of splice sites, e.g. donor and acceptor sites, is introduced in order to compare recognition ratio of true positive and true negative. {F}rom the results of comparison, addition of structural information has brought no significant benefit for the recognition of splice sites and had even lowered the rate of recognition. {O}ur results suggest that, through three cross validation, the {SVM} method can achieve a good performance for splice sites identification.},
  doi = {10.1016/S0010-4825(02)00057-4},
  pdf = {../local/Sun2003Identifying.pdf},
  file = {Sun2003Identifying.pdf:local/Sun2003Identifying.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1016/S0010-4825(02)00057-4}
}
@article{Swamidass2005Kernels,
  author = {Swamidass, S. J. and Chen, J. and Bruand, J. and Phung, P. and Ralaivola, L. and Baldi, P.},
  title = {Kernels for small molecules and the prediction of mutagenicity, toxicity and anti-cancer activity.},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21},
  pages = {i359--i368},
  number = {Suppl. 1},
  month = jun,
  abstract = {M{OTIVATION}: {S}mall molecules play a fundamental role in organic chemistry and biology. {T}hey can be used to probe biological systems and to discover new drugs and other useful compounds. {A}s increasing numbers of large datasets of small molecules become available, it is necessary to develop computational methods that can deal with molecules of variable size and structure and predict their physical, chemical and biological properties. {RESULTS}: {H}ere we develop several new classes of kernels for small molecules using their 1{D}, 2{D} and 3{D} representations. {I}n 1{D}, we consider string kernels based on {SMILES} strings. {I}n 2{D}, we introduce several similarity kernels based on conventional or generalized fingerprints. {G}eneralized fingerprints are derived by counting in different ways subpaths contained in the graph of bonds, using depth-first searches. {I}n 3{D}, we consider similarity measures between histograms of pairwise distances between atom classes. {T}hese kernels can be computed efficiently and are applied to problems of classification and prediction of mutagenicity, toxicity and anti-cancer activity on three publicly available datasets. {T}he results derived using cross-validation methods are state-of-the-art. {T}radeoffs between various kernels are briefly discussed.
{AVAILABILITY}: {D}atasets available from http://www.igb.uci.edu/servers/servers.html {CONTACT}: pfbaldi@ics.uci.edu.},
  doi = {10.1093/bioinformatics/bti1055},
  pdf = {../local/Swamidass2005Kernels.pdf},
  file = {Swamidass2005Kernels.pdf:Swamidass2005Kernels.pdf:PDF},
  keywords = {biosvm},
  pii = {21/suppl_1/i359},
  url = {http://dx.doi.org/10.1093/bioinformatics/bti1055}
}
@article{Takaoka2003Development,
  author = {Y. Takaoka and Y. Endo and S. Yamanobe and H. Kakinuma and T. Okubo and Y. Shimazaki and T. Ota and S. Sumiya and K. Yoshikawa},
  title = {Development of a method for evaluating drug-likeness and ease of synthesis using a data set in which compounds are assigned scores based on chemists' intuition.},
  journal = {J {C}hem {I}nf {C}omput {S}ci},
  year = {2003},
  volume = {43},
  pages = {1269--1275},
  number = {4},
  abstract = {The concept of drug-likeness, an important characteristic for any compound in a screening library, is nevertheless difficult to pin down. {B}ased on our belief that this concept is implicit within the collective experience of working chemists, we devised a data set to capture an intuitive human understanding of both this characteristic and ease of synthesis, a second key characteristic. {F}ive chemists assigned a pair of scores to each of 3980 diverse compounds, with the component scores of each pair corresponding to drug-likeness and ease of synthesis, respectively. {U}sing this data set, we devised binary classifiers with an artificial neural network and a support vector machine. {T}hese models were found to efficiently eliminate compounds that are not drug-like and/or hard-to-synthesize derivatives, demonstrating the suitability of these models for use as compound acquisition filters.},
  doi = {10.1021/ci034043l},
  pdf = {../local/Takaoka2003Development.pdf},
  file = {Takaoka2003Development.pdf:local/Takaoka2003Development.pdf:PDF},
  keywords = {biosvm},
  url = {http://dx.doi.org/10.1021/ci034043l}
}
@article{Takeuchi2005Bio-medical,
  author = {Koichi Takeuchi and Nigel Collier},
  title = {Bio-medical entity extraction using support vector machines.},
  journal = {Artif. {I}ntell. {M}ed.},
  year = {2005},
  volume = {33},
  pages = {125--137},
  number = {2},
  month = feb,
  abstract = {O{BJECTIVE}: {S}upport vector machines ({SVM}s) have achieved state-of-the-art performance in several classification tasks. {I}n this article we apply them to the identification and semantic annotation of scientific and technical terminology in the domain of molecular biology. {T}his illustrates the extensibility of the traditional named entity task to special domains with large-scale terminologies such as those in medicine and related disciplines. {METHODS} {AND} {MATERIALS}: {T}he foundation for the model is a sample of text annotated by a domain expert according to an ontology of concepts, properties and relations. {T}he model then learns to annotate unseen terms in new texts and contexts. {T}he results can be used for a variety of intelligent language processing applications. {W}e illustrate {SVM}s capabilities using a sample of 100 journal abstracts texts taken from the \{human, blood cell, transcription factor\} domain of {MEDLINE}. {RESULTS}: {A}pproximately 3400 terms are annotated and the model performs at about 74\% {F}-score on cross-validation tests. {A} detailed analysis based on empirical evidence shows the contribution of various feature sets to performance. {CONCLUSION}: {O}ur experiments indicate a relationship between feature window size and the amount of training data and that a combination of surface words, orthographic features and head noun features achieve the best performance among the feature sets tested.},
  doi = {10.1016/j.artmed.2004.07.019},
  pdf = {../local/Takeuchi2005Bio-medical.pdf},
  file = {Takeuchi2005Bio-medical.pdf:local/Takeuchi2005Bio-medical.pdf:PDF},
  keywords = {biosvm},
  pii = {S0933-3657(04)00130-7},
  url = {http://dx.doi.org/10.1016/j.artmed.2004.07.019}
}
@article{Tang2005Discovering,
  author = {Thomas Tang and Jinbo Xu and Ming Li},
  title = {Discovering sequence-structure motifs from protein segments and two applications.},
  journal = {Pac {S}ymp {B}iocomput},
  year = {2005},
  pages = {370--381},
  abstract = {We present a novel method for clustering short protein segments having strong sequence-structure correlations, and demonstrate that these clusters contain useful structural information via two applications. {W}hen applied to local tertiary structure prediction, we achieve approximately 60\% accuracy with a novel dynamic programming algorithm. {W}hen applied to secondary structure prediction based on {S}upport {V}ector {M}achines, we obtain a approximately 2\% gain in {Q}3 performance by incorporating cluster-derived data into training and classification. {T}hese encouraging results illustrate the great potential of using conserved local motifs to tackle protein structure predictions and possibly other important problems in biology.},
  keywords = {biosvm}
}
@article{Teramoto2005Prediction,
  author = {Reiji Teramoto and Mikio Aoki and Toru Kimura and Masaharu Kanaoka},
  title = {Prediction of si{RNA} functionality using generalized string kernel and support vector machine.},
  journal = {{FEBS} {L}ett.},
  year = {2005},
  volume = {579},
  pages = {2878--2882},
  number = {13},
  month = may,
  abstract = {Small interfering {RNA}s (si{RNA}s) are becoming widely used for sequence-specific gene silencing in mammalian cells, but designing an effective si{RNA} is still a challenging task. {I}n this study, we developed an algorithm for predicting si{RNA} functionality by using generalized string kernel ({GSK}) combined with support vector machine ({SVM}). {W}ith {GSK}, si{RNA} sequences were represented as vectors in a multi-dimensional feature space according to the numbers of subsequences in each si{RNA}, and subsequently classified with {SVM} into effective or ineffective si{RNA}s. {W}e applied this algorithm to published si{RNA}s, and could classify effective and ineffective si{RNA}s with 90.6\%, 86.2\% accuracy, respectively.},
  doi = {10.1016/j.febslet.2005.04.045},
  pdf = {../local/Teramoto2005Prediction.pdf},
  file = {Teramoto2005Prediction.pdf:local/Teramoto2005Prediction.pdf:PDF},
  keywords = {sirna biosvm},
  pii = {S0014-5793(05)00520-X},
  url = {http://dx.doi.org/10.1016/j.febslet.2005.04.045}
}
@article{Tobita2005discriminant,
  author = {Tobita, M. and Nishikawa, T. and Nagashima, R.},
  title = {A discriminant model constructed by the support vector machine method for {HERG} potassium channel inhibitors.},
  journal = {Bioorg. {M}ed. {C}hem. {L}ett.},
  year = {2005},
  volume = {15},
  pages = {2886--2890},
  number = {11},
  month = jun,
  abstract = {H{ERG} attracts attention as a risk factor for arrhythmia, which might trigger torsade de pointes. {A} highly accurate classifier of chemical compounds for inhibition of the {HERG} potassium channel is constructed using support vector machine. {F}or two test sets, our discriminant models achieved 90\% and 95\% accuracy, respectively. {T}he classifier is even applied for the prediction of cardio vascular adverse effects to achieve about 70\% accuracy. {W}hile modest inhibitors are partly characterized by properties linked to global structure of a molecule including hydrophobicity and diameter, strong inhibitors are exclusively characterized by properties linked to substructures of a molecule.},
  doi = {10.1016/j.bmcl.2005.03.080},
  pdf = {../local/Tobita2005discriminant.pdf},
  file = {Tobita2005discriminant.pdf:local/Tobita2005discriminant.pdf:PDF},
  keywords = {biosvm chemoinformatics herg},
  pii = {S0960-894X(05)00403-8},
  url = {http://dx.doi.org/10.1016/j.bmcl.2005.03.080}
}
@article{Tothill2005expression-based,
  author   = {Richard W Tothill and Adam Kowalczyk and Danny Rischin and Alex Bousioutas and Izhak Haviv and Ryan K van Laar and Paul M Waring and John Zalcberg and Robyn Ward and Andrew V Biankin and Robert L Sutherland and Susan M Henshall and Kwun Fong and Jonathan R Pollack and David D L Bowtell and Andrew J Holloway},
  title    = {An expression-based site of origin diagnostic method designed for clinical application to cancer of unknown origin.},
  journal  = {Cancer {R}es.},
  year     = {2005},
  volume   = {65},
  pages    = {4031-40},
  number   = {10},
  month    = {May},
  abstract = {Gene expression profiling offers a promising new technique for the diagnosis and prognosis of cancer. {W}e have applied this technology to build a clinically robust site of origin classifier with the ultimate aim of applying it to determine the origin of cancer of unknown primary ({CUP}). {A} single c{DNA} microarray platform was used to profile 229 primary and metastatic tumors representing 14 tumor types and multiple histologic subtypes. {T}his data set was subsequently used for training and validation of a support vector machine ({SVM}) classifier, demonstrating 89\% accuracy using a 13-class model. {F}urther, we show the translation of a five-class classifier to a quantitative {PCR}-based platform. {S}electing 79 optimal gene markers, we generated a quantitative-{PCR} low-density array, allowing the assay of both fresh-frozen and formalin-fixed paraffin-embedded ({FFPE}) tissue. {D}ata generated using both quantitative {PCR} and microarray were subsequently used to train and validate a cross-platform {SVM} model with high prediction accuracy. {F}inally, we applied our {SVM} classifiers to 13 cases of {CUP}. {W}e show that the microarray {SVM} classifier was capable of making high confidence predictions in 11 of 13 cases. {T}hese predictions were supported by comprehensive review of the patients' clinical histories.},
  doi      = {10.1158/0008-5472.CAN-04-3617},
  pdf      = {../local/Tothill2005expression-based.pdf},
  file     = {Tothill2005expression-based.pdf:local/Tothill2005expression-based.pdf:PDF},
  keywords = {biosvm microarray},
  pii      = {65/10/4031},
  url      = {http://dx.doi.org/10.1158/0008-5472.CAN-04-3617}
}
@article{Tsai2004Gene,
  author   = {Tsai, C.A. and Chen, C.H. and Lee, T.C. and Ho, I.C. and Yang, U.C. and Chen, J.J.},
  title    = {Gene selection for sample classifications in microarray experiments.},
  journal  = {D{NA} {C}ell {B}iol.},
  year     = {2004},
  volume   = {23},
  pages    = {607-614},
  number   = {10},
  abstract = {D{NA} microarray technology provides useful tools for profiling global gene expression patterns in different cell/tissue samples. {O}ne major challenge is the large number of genes relative to the number of samples. {T}he use of all genes can suppress or reduce the performance of a classification rule due to the noise of nondiscriminatory genes. {S}election of an optimal subset from the original gene set becomes an important prestep in sample classification. {I}n this study, we propose a family-wise error ({FWE}) rate approach to selection of discriminatory genes for two-sample or multiple-sample classification. {T}he {FWE} approach controls the probability of the number of one or more false positives at a prespecified level. {A} public colon cancer data set is used to evaluate the performance of the proposed approach for the two classification methods: k nearest neighbors (k-{NN}) and support vector machine ({SVM}). {T}he selected gene sets from the proposed procedure appears to perform better than or comparable to several results reported in the literature using the univariate analysis without performing multivariate search. {I}n addition, we apply the {FWE} approach to a toxicogenomic data set with nine treatments (a control and eight metals, {A}s, {C}d, {N}i, {C}r, {S}b, {P}b, {C}u, and {A}s{V}) for a total of 55 samples for a multisample classification. {T}wo gene sets are considered: the gene set omega{F} formed by the {ANOVA} {F}-test, and a gene set omega{T} formed by the union of one-versus-all t-tests. {T}he predicted accuracies are evaluated using the internal and external crossvalidation. {U}sing the {SVM} classification, the overall accuracies to predict 55 samples into one of the nine treatments are above 80\% for internal crossvalidation. {O}mega{F} has slightly higher accuracy rates than omega{T}. {T}he overall predicted accuracies are above 70\% for the external crossvalidation; the two gene sets omega{T} and omega{F} performed equally well.},
  doi      = {10.1089/1044549042476947},
  pdf      = {../local/Tsai2004Gene.pdf},
  file     = {Tsai2004Gene.pdf:local/Tsai2004Gene.pdf:PDF},
  keywords = {biosvm microarray},
  owner    = {jeanphilippevert},
  url      = {http://dx.doi.org/10.1089/1044549042476947}
}
@article{Tsirigos2005sensitive,
  author   = {Tsirigos, A. and Rigoutsos, I.},
  title    = {A sensitive, support-vector-machine method for the detection of horizontal gene transfers in viral, archaeal and bacterial genomes.},
  journal  = {Nucleic {A}cids {R}es.},
  year     = {2005},
  volume   = {33},
  pages    = {3699-707},
  number   = {12},
  abstract = {In earlier work, we introduced and discussed a generalized computational framework for identifying horizontal transfers. {T}his framework relied on a gene's nucleotide composition, obviated the need for knowledge of codon boundaries and database searches, and was shown to perform very well across a wide range of archaeal and bacterial genomes when compared with previously published approaches, such as {C}odon {A}daptation {I}ndex and {C} + {G} content. {N}onetheless, two considerations remained outstanding: we wanted to further increase the sensitivity of detecting horizontal transfers and also to be able to apply the method to increasingly smaller genomes. {I}n the discussion that follows, we present such a method, {W}n-{SVM}, and show that it exhibits a very significant improvement in sensitivity compared with earlier approaches. {W}n-{SVM} uses a one-class support-vector machine and can learn using rather small training sets. {T}his property makes {W}n-{SVM} particularly suitable for studying small-size genomes, similar to those of viruses, as well as the typically larger archaeal and bacterial genomes. {W}e show experimentally that the new method results in a superior performance across a wide range of organisms and that it improves even upon our own earlier method by an average of 10\% across all examined genomes. {A}s a small-genome case study, we analyze the genome of the human cytomegalovirus and demonstrate that {W}n-{SVM} correctly identifies regions that are known to be conserved and prototypical of all beta-herpesvirinae, regions that are known to have been acquired horizontally from the human host and, finally, regions that had not up to now been suspected to be horizontally transferred. {A}typical region predictions for many eukaryotic viruses, including the alpha-, beta- and gamma-herpesvirinae, and 123 archaeal and bacterial genomes, have been made available online at http://cbcsrv.watson.ibm.com/{HGT}_{SVM}/.},
  doi      = {10.1093/nar/gki660},
  pdf      = {../local/Tsirigos2005sensitive.pdf},
  file     = {Tsirigos2005sensitive.pdf:local/Tsirigos2005sensitive.pdf:PDF},
  keywords = {biosvm},
  pii      = {33/12/3699},
  url      = {http://dx.doi.org/10.1093/nar/gki660}
}
@article{Tsuda2003em,
  author   = {Tsuda, K. and Akaho, S. and Asai, K.},
  title    = {The em {A}lgorithm for {K}ernel {M}atrix {C}ompletion with {A}uxiliary {D}ata},
  journal  = {J. {M}ach. {L}earn. {R}es.},
  year     = {2003},
  volume   = {4},
  pages    = {67-81},
  abstract = {In biological data, it is often the case that observed data are available only for a subset of samples. {W}hen a kernel matrix is derived from such data, we have to leave the entries for unavailable samples as missing. {I}n this paper, the missing entries are completed by exploiting an auxiliary kernel matrix derived from another information source. {T}he parametric model of kernel matrices is created as a set of spectral variants of the auxiliary kernel matrix, and the missing entries are estimated by fitting this model to the existing entries. {F}or model fitting, we adopt the em algorithm (distinguished from the {EM} algorithm of {D}empster et al., 1977) based on the information geometry of positive definite matrices. {W}e will report promising results on bacteria clustering experiments using two marker sequences: 16{S} and gyr{B}.},
  pdf      = {../local/Tsuda2003em.pdf},
  file     = {Tsuda2003em.pdf:local/Tsuda2003em.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://www.jmlr.org/papers/v4/tsuda03a.html}
}
@article{Tsuda2002new,
  author   = {K. Tsuda and M. Kawanabe and G. R{\"a}tsch and S. Sonnenburg and K.-R. M{\"u}ller},
  title    = {A new discriminative kernel from probabilistic models},
  journal  = {Neural {C}omputation},
  year     = {2002},
  volume   = {14},
  pages    = {2397--2414},
  number   = {10},
  doi      = {10.1162/08997660260293274},
  pdf      = {../local/Tsuda2002new.pdf},
  file     = {Tsuda2002new.pdf:local/Tsuda2002new.pdf:PDF},
  keywords = {biosvm},
  url      = {http://dx.doi.org/10.1162/08997660260293274}
}
@article{Tsuda2002Marginalized,
  author   = {K. Tsuda and T. Kin and K. Asai},
  title    = {Marginalized {K}ernels for {B}iological {S}equences},
  journal  = {Bioinformatics},
  year     = {2002},
  volume   = {18},
  pages    = {S268--S275},
  abstract = {Motivation: {K}ernel methods such as support vector machines require a kernel function between objects to be defined a priori. {S}everal works have been done to derive kernels from probability distributions, e.g., the {F}isher kernel. {H}owever, a general methodology to design a kernel is not fully developed. {R}esults: {W}e propose a reasonable way of designing a kernel when objects are generated from latent variable models (e.g., {HMM}). {F}irst of all, a joint kernel is designed for complete data which include both visible and hidden variables. {T}hen a marginalized kernel for visible data is obtained by taking the expectation with respect to hidden variables. {W}e will show that the {F}isher kernel is a special case of marginalized kernels, which gives another viewpoint to the {F}isher kernel theory. {A}lthough our approach can be applied to any object, we particularly derive several marginalized kernels useful for biological sequences (e.g., {DNA} and proteins). {T}he effectiveness of marginalized kernels is illustrated in the task of classifying bacterial gyrase subunit {B} (gyr{B}) amino acid sequences.},
  comment  = {Introduces the idea of marginalized kernel. Show that the Fisher kernel is a particular case of it, and modify it. Application to bacterial gyrB classification.},
  pdf      = {../local/Tsuda2002Marginalized.pdf},
  file     = {Tsuda2002Marginalized.pdf:local/Tsuda2002Marginalized.pdf:PDF},
  keywords = {biosvm}
}
@comment{{NOTE(review): the comment field of Tsuda2004Learning describes multiclass tumor classification from gene expression data, which does not match the title or abstract of this entry; it looks copied from another record. Confirm and replace.}}
@article{Tsuda2004Learning,
  author   = {Tsuda, K. and Noble, W.S.},
  title    = {Learning kernels from biological networks by maximizing entropy},
  journal  = {Bioinformatics},
  year     = {2004},
  volume   = {20},
  pages    = {i326--i333},
  abstract = {Motivation: {T}he diffusion kernel is a general method for computing pairwise distances among all nodes in a graph, based on the sum of weighted paths between each pair of nodes. {T}his technique has been used successfully, in conjunction with kernel-based learning methods, to draw inferences from several types of biological networks. {R}esults: {W}e show that computing the diffusion kernel is equivalent to maximizing the von {N}eumann entropy, subject to a global constraint on the sum of the {E}uclidean distances between nodes. {T}his global constraint allows for high variance in the pairwise distances. {A}ccordingly, we propose an alternative, locally constrained diffusion kernel, and we demonstrate that the resulting kernel allows for more accurate support vector machine prediction of protein functional classifications from metabolic and protein-protein interaction networks. {A}vailability: {S}upplementary results and data are available at noble.gs.washington.edu/proj/maxent},
  comment  = {Problem = multiclass classification of tumor cells from gene expression. Show that the one-versus-all approach of combining SVM yields the minimum number of classification errors on their Affymetrix data with 14 tumor types. In addition to not taking variability estimates of repeated measurements into account, this approach selects different relevant features (genes) for each binary classifier.},
  doi      = {10.1093/bioinformatics/bth906},
  pdf      = {../local/Tsuda2004Learning.pdf},
  file     = {Tsuda2004Learning.pdf:local/Tsuda2004Learning.pdf:PDF},
  keywords = {learning-kernel graph-kernel biosvm},
  owner    = {vert},
  url      = {http://dx.doi.org/10.1093/bioinformatics/bth906}
}
@article{Valentini2002Gene,
  author   = {Valentini, G.},
  title    = {Gene expression data analysis of human lymphoma using support vector machines and output coding ensembles.},
  journal  = {Artif. {I}ntell. {M}ed.},
  year     = {2002},
  volume   = {26},
  pages    = {281-304},
  number   = {3},
  month    = {Nov},
  abstract = {The large amount of data generated by {DNA} microarrays was originally analysed using unsupervised methods, such as clustering or self-organizing maps. {R}ecently supervised methods such as decision trees, dot-product support vector machines ({SVM}) and multi-layer perceptrons ({MLP}) have been applied in order to classify normal and tumoural tissues. {W}e propose methods based on non-linear {SVM} with polynomial and {G}aussian kernels, and output coding ({OC}) ensembles of learning machines to separate normal from malignant tissues, to classify different types of lymphoma and to analyse the role of sets of coordinately expressed genes in carcinogenic processes of lymphoid tissues. {U}sing gene expression data from "{L}ymphochip", a specialised {DNA} microarray developed at {S}tanford {U}niversity {S}chool of {M}edicine, we show that {SVM} can correctly separate normal from tumoural tissues, and {OC} ensembles can be successfully used to classify different types of lymphoma. {M}oreover, we identify a group of coordinately expressed genes related to the separation of two distinct subgroups inside diffuse large {B}-cell lymphoma ({DLBCL}), validating a previous {A}lizadeh's hypothesis about the existence of two distinct diseases inside {DLBCL}.},
  doi      = {10.1016/S0933-3657(02)00077-5},
  pdf      = {../local/Valentini2002Gene.pdf},
  file     = {Valentini2002Gene.pdf:local/Valentini2002Gene.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert}
}
@inproceedings{Vert2002Support,
  author    = {Vert, J.-P.},
  title     = {Support vector machine prediction of signal peptide cleavage site using a new class of kernels for strings},
  booktitle = {Proceedings of the {P}acific {S}ymposium on {B}iocomputing 2002},
  year      = {2002},
  editor    = {R. B. Altman and A. K. Dunker and L. Hunter and K. Lauderdale and T. E. Klein},
  pages     = {649--660},
  publisher = {World Scientific},
  pdf       = {../local/vert02.pdf},
  file      = {vert02.pdf:local/vert02.pdf:PDF},
  keywords  = {biosvm},
  subject   = {biokernel},
  url       = {http://www.smi.stanford.edu/projects/helix/psb02/vert.pdf}
}
@techreport{Vert2005Kernel,
  author      = {Vert, J.-P.},
  title       = {Kernel methods in computational biology},
  institution = {CNRS-HAL},
  year        = {2005},
  number      = {ccsd-00012124},
  month       = {Oct},
  abstract    = {Support vector machines and kernel methods are increasingly popular in genomics and computational biology, due to their good performance in real-world applications and strong modularity that makes them suitable to a wide range of problems, from the classification of tumors to the automatic annotation of proteins. {T}heir ability to work in high dimension, to process non-vectorial data, and the natural framework they provide to integrate heterogeneous data are particularly relevant to various problems arising in computational biology. {I}n this chapter we survey some of the most prominent applications published so far, highlighting the particular developments in kernel methods triggered by problems in biology, and mention a few promising research directions likely to expand in the future.},
  pdf         = {../local/Vert2005Kernel.pdf},
  file        = {Vert2005Kernel.pdf:local/Vert2005Kernel.pdf:PDF},
  keywords    = {biosvm},
  url         = {http://hal.ccsd.cnrs.fr/ccsd-00012124}
}
@article{Vert2002tree,
  author   = {Vert, J.-P.},
  title    = {A tree kernel to analyze phylogenetic profiles},
  journal  = {Bioinformatics},
  year     = {2002},
  volume   = {18},
  pages    = {S276--S284},
  pdf      = {../local/vert02b.pdf},
  file     = {vert02b.pdf:local/vert02b.pdf:PDF},
  keywords = {biosvm},
  subject  = {biokernel},
  url      = {http://cbio.ensmp.fr/~jvert/publi/ismb02/index.html}
}
@inproceedings{Vert2003Graph-driven,
  author    = {Vert, J.-P. and Kanehisa, M.},
  title     = {Graph-driven features extraction from microarray data using diffusion kernels and kernel {CCA}},
  booktitle = {Adv. {N}eural {I}nform. {P}rocess. {S}yst.},
  year      = {2003},
  editor    = {S. Becker and S. Thrun and K. Obermayer},
  pages     = {1449--1456},
  publisher = {MIT Press},
  pdf       = {../local/Vert2003Graph-driven.pdf},
  file      = {Vert2003Graph-driven.pdf:local/Vert2003Graph-driven.pdf:PDF},
  keywords  = {biosvm}
}
@article{Vert2003Extracting,
  author   = {Vert, J.-P. and Kanehisa, M.},
  title    = {Extracting active pathways from gene expression data},
  journal  = {Bioinformatics},
  year     = {2003},
  volume   = {19},
  pages    = {ii238--ii244},
  abstract = {Motivation: {A} promising way to make sense out of gene expression profiles is to relate them to the activity of metabolic and signalling pathways. {E}ach pathway usually involves many genes, such as enzymes, which can themselves participate in many pathways. {T}he set of all known pathways can therefore be represented by a complex network of genes. {S}earching for regularities in the set of gene expression profiles with respect to the topology of this gene network is a way to automatically extract active pathways and their associated patterns of activity. {M}ethod: {W}e present a method to perform this task, which consists in encoding both the gene network and the set of profiles into two kernel functions, and performing a regularized form of canonical correlation analysis between the two kernels. {R}esults: {W}hen applied to publicly available expression data the method is able to extract biologically relevant expression patterns, as well as pathways with related activity.},
  pdf      = {../local/Vert2003Extracting.pdf},
  file     = {Vert2003Extracting.pdf:local/Vert2003Extracting.pdf:PDF},
  keywords = {biosvm},
  owner    = {vert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/suppl_2/ii238}
}
@techreport{Vert2002Graph-driven,
  author      = {Vert, J.-P. and Kanehisa, M.},
  title       = {Graph-driven features extraction from microarray data},
  institution = {Arxiv physics},
  year        = {2002},
  number      = {0206055},
  keywords    = {biosvm}
}
@incollection{Vert2004Local,
  author    = {Vert, J.-P. and Saigo, H. and Akutsu, T.},
  title     = {Local alignment kernels for biological sequences},
  booktitle = {Kernel {M}ethods in {C}omputational {B}iology},
  publisher = {MIT Press},
  year      = {2004},
  editor    = {Sch{\"o}lkopf, B. and Tsuda, K. and Vert, J.-P.},
  pages     = {131-154},
  address   = {Cambridge, MA},
  pdf       = {http://cg.ensmp.fr/~vert/publi/04kmcbbook/saigo.pdf},
  file      = {saigo.pdf:http\://cg.ensmp.fr/~vert/publi/04kmcbbook/saigo.pdf:PDF},
  keywords  = {biosvm},
  owner     = {vert}
}
@inproceedings{Vert2006Kernels,
  author    = {Vert, J.-P. and Thurman, R. and Noble, W. S.},
  title     = {Kernels for gene regulatory regions},
  booktitle = {Adv. {N}eural {I}nform. {P}rocess. {S}yst.},
  year      = {2006},
  editor    = {Y. Weiss and B. Sch{\"o}lkopf and J. Platt},
  volume    = {18},
  pages     = {1401-1408},
  address   = {Cambridge, MA},
  publisher = {MIT Press},
  keywords  = {biosvm}
}
@incollection{Vert2004primer,
  author    = {Vert, J.-P. and Tsuda, K. and Sch{\"o}lkopf, B.},
  title     = {A primer on kernel methods},
  booktitle = {Kernel {M}ethods in {C}omputational {B}iology},
  publisher = {MIT Press},
  year      = {2004},
  editor    = {Sch{\"o}lkopf, B. and Tsuda, K. and Vert, J.-P.},
  pages     = {35-70},
  keywords  = {biosvm},
  owner     = {vert}
}
@inproceedings{Vert2005Supervised,
  author    = {Vert, J.-P. and Yamanishi, Y.},
  title     = {Supervised graph inference},
  booktitle = {Adv. {N}eural {I}nform. {P}rocess. {S}yst.},
  year      = {2005},
  editor    = {Saul, L. K. and Weiss, Y. and Bottou, L.},
  volume    = {17},
  pages     = {1433-1440},
  publisher = {MIT Press},
  address   = {Cambridge, MA},
  pdf       = {http://cg.ensmp.fr/~vert/publi/04nips_yamanishi/nips2004.pdf},
  file      = {nips2004.pdf:http\://cg.ensmp.fr/~vert/publi/04nips_yamanishi/nips2004.pdf:PDF},
  keywords  = {biosvm},
  owner     = {vert}
}
@article{Vinayagam2004Applying,
  author   = {Vinayagam, A. and K{\"o}nig, R. and Moormann, J. and Schubert, F. and Eils, R. and Glatting, K.-H. and Suhai, S.},
  title    = {Applying {S}upport {V}ector {M}achines for {G}ene {O}ntology based gene function prediction.},
  journal  = {B{MC} {B}ioinformatics},
  year     = {2004},
  volume   = {5},
  pages    = {116},
  number   = {1},
  month    = {Aug},
  abstract = {B{ACKGROUND}: {T}he current progress in sequencing projects calls for rapid, reliable and accurate function assignments of gene products. {A} variety of methods has been designed to annotate sequences on a large scale. {H}owever, these methods can either only be applied for specific subsets, or their results are not formalised, or they do not provide precise confidence estimates for their predictions. {RESULTS}: {W}e have developed a large-scale annotation system that tackles all of these shortcomings. {I}n our approach, annotation was provided through {G}ene {O}ntology terms by applying multiple {S}upport {V}ector {M}achines ({SVM}) for the classification of correct and false predictions. {T}he general performance of the system was benchmarked with a large dataset. {A}n organism-wise cross-validation was performed to define confidence estimates, resulting in an average precision of 80\% for 74\% of all test sequences. {T}he validation results show that the prediction performance was organism-independent and could reproduce the annotation of other automated systems as well as high-quality manual annotations. {W}e applied our trained classification system to {X}enopus laevis sequences, yielding functional annotation for more than half of the known expressed genome. {C}ompared to the currently available annotation, we provided more than twice the number of contigs with good quality annotation, and additionally we assigned a confidence value to each predicted {GO} term. {CONCLUSIONS}: {W}e present a complete automated annotation system that overcomes many of the usual problems by applying a controlled vocabulary of {G}ene {O}ntology and an established classification method on large and well-described sequence data sets. {I}n a case study, the function for {X}enopus laevis contig sequences was predicted and the results are publicly available at ftp://genome.dkfz-heidelberg.de/pub/agd/gene_association.agd_{X}enopus.},
  doi      = {10.1186/1471-2105-5-116},
  pdf      = {../local/Vinayagam2004Applying.pdf},
  file     = {Vinayagam2004Applying.pdf:local/Vinayagam2004Applying.pdf:PDF},
  keywords = {biosvm},
  pii      = {1471-2105-5-116},
  url      = {http://dx.doi.org/10.1186/1471-2105-5-116}
}
@article{Vlahovicek2005SBASE,
  author   = {Kristian Vlahovicek and L{\'a}szl{\'o} Kaj{\'a}n and Vilmos Agoston and S{\'a}ndor Pongor},
  title    = {The {SBASE} domain sequence resource, release 12: prediction of protein domain-architecture using support vector machines.},
  journal  = {Nucleic {A}cids {R}es.},
  year     = {2005},
  volume   = {33},
  pages    = {D223-5},
  number   = {Database issue},
  month    = {Jan},
  abstract = {S{BASE} (http://www.icgeb.trieste.it/sbase) is an online resource designed to facilitate the detection of domain homologies based on sequence database search. {T}he present release of the {SBASE} {A} library of protein domain sequences contains 972,397 protein sequence segments annotated by structure, function, ligand-binding or cellular topology, clustered into 8547 domain groups. {SBASE} {B} contains 169,916 domain sequences clustered into 2526 less well-characterized groups. {D}omain prediction is based on an evaluation of database search results in comparison with a 'similarity network' of inter-sequence similarity scores, using support vector machines trained on similarity search results of known domains.},
  doi      = {10.1093/nar/gki112},
  pdf      = {../local/Vlahovicek2005SBASE.pdf},
  file     = {Vlahovicek2005SBASE.pdf:local/Vlahovicek2005SBASE.pdf:PDF},
  keywords = {biosvm},
  pii      = {33/suppl_1/D223},
  url      = {http://dx.doi.org/10.1093/nar/gki112}
}
@article{Wagner2003Protocols,
  author   = {Wagner, M. and Naik, D. and Pothen, A.},
  title    = {Protocols for disease classification from mass spectrometry data.},
  journal  = {Proteomics},
  year     = {2003},
  volume   = {3},
  pages    = {1692-1698},
  number   = {9},
  abstract = {We report our results in classifying protein matrix-assisted laser desorption/ionization-time of flight mass spectra obtained from serum samples into diseased and healthy groups. {W}e discuss in detail five of the steps in preprocessing the mass spectral data for biomarker discovery, as well as our criterion for choosing a small set of peaks for classifying the samples. {C}ross-validation studies with four selected proteins yielded misclassification rates in the 10-15\% range for all the classification methods. {T}hree of these proteins or protein fragments are down-regulated and one up-regulated in lung cancer, the disease under consideration in this data set. {W}hen cross-validation studies are performed, care must be taken to ensure that the test set does not influence the choice of the peaks used in the classification. {M}isclassification rates are lower when both the training and test sets are used to select the peaks used in classification versus when only the training set is used. {T}his expectation was validated for various statistical discrimination methods when thirteen peaks were used in cross-validation studies. {O}ne particular classification method, a linear support vector machine, exhibited especially robust performance when the number of peaks was varied from four to thirteen, and when the peaks were selected from the training set alone. {E}xperiments with the samples randomly assigned to the two classes confirmed that misclassification rates were significantly higher in such cases than those observed with the true data. {T}his indicates that our findings are indeed significant. {W}e found closely matching masses in a database for protein expression in lung cancer for three of the four proteins we used to classify lung cancer. {D}ata from additional samples, increased experience with the performance of various preprocessing techniques, and affirmation of the biological roles of the proteins that help in classification, will strengthen our conclusions in the future.},
  doi      = {10.1002/pmic.200300519},
  pdf      = {../local/Wagner2003Protocols.pdf},
  file     = {Wagner2003Protocols.pdf:local/Wagner2003Protocols.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://dx.doi.org/10.1002/pmic.200300519}
}
@article{Wagner2004Computational,
  author   = {Wagner, M. and Naik, D.N. and Pothen, A. and Kasukurti, S. and Devineni, R.R. and Adam, B.L. and Semmes, O.J. and Wright Jr, G.L.},
  title    = {Computational protein biomarker prediction: a case study for prostate cancer},
  journal  = {B{MC} {B}ioinformatics},
  year     = {2004},
  volume   = {5},
  pages    = {26},
  abstract = {Background {R}ecent technological advances in mass spectrometry pose challenges in computational mathematics and statistics to process the mass spectral data into predictive models with clinical and biological significance. {W}e discuss several classification-based approaches to finding protein biomarker candidates using protein profiles obtained via mass spectrometry, and we assess their statistical significance. {O}ur overall goal is to implicate peaks that have a high likelihood of being biologically linked to a given disease state, and thus to narrow the search for biomarker candidates. {R}esults {T}horough cross-validation studies and randomization tests are performed on a prostate cancer dataset with over 300 patients, obtained at the {E}astern {V}irginia {M}edical {S}chool using {SELDI}-{TOF} mass spectrometry. {W}e obtain average classification accuracies of 87\% on a four-group classification problem using a two-stage linear {SVM}-based procedure and just 13 peaks, with other methods performing comparably. {C}onclusions {M}odern feature selection and classification methods are powerful techniques for both the identification of biomarker candidates and the related problem of building predictive models from protein mass spectrometric profiles. {C}ross-validation and randomization are essential tools that must be performed carefully in order not to bias the results unfairly. {H}owever, only a biological validation and identification of the underlying proteins will ultimately confirm the actual value and power of any computational predictions.},
  doi      = {10.1186/1471-2105-5-26},
  pdf      = {../local/Wagner2004Computational.pdf},
  file     = {Wagner2004Computational.pdf:local/Wagner2004Computational.pdf:PDF},
  keywords = {biosvm},
  owner    = {vert},
  url      = {http://www.biomedcentral.com/1471-2105/5/26}
}
@article{Wang2005Protein,
  author   = {Wang, J. and Sung, W.-K. and Krishnan, A. and Li, K.-B.},
  title    = {Protein subcellular localization prediction for {G}ram-negative bacteria using amino acid subalphabets and a combination of multiple support vector machines.},
  journal  = {B{MC} {B}ioinformatics},
  year     = {2005},
  volume   = {6},
  pages    = {174},
  number   = {1},
  month    = {Jul},
  abstract = {B{ACKGROUND}: {P}redicting the subcellular localization of proteins is important for determining the function of proteins. {P}revious works focused on predicting protein localization in {G}ram-negative bacteria obtained good results. {H}owever, these methods had relatively low accuracies for the localization of extracellular proteins. {T}his paper studies ways to improve the accuracy for predicting extracellular localization in {G}ram-negative bacteria. {RESULTS}: {W}e have developed a system for predicting the subcellular localization of proteins for {G}ram-negative bacteria based on amino acid subalphabets and a combination of multiple support vector machines. {T}he recall of the extracellular site and overall recall of our predictor reach 86.0\% and 89.8\%, respectively, in 5-fold cross-validation. {T}o the best of our knowledge, these are the most accurate results for predicting subcellular localization in {G}ram-negative bacteria. {CONCLUSIONS}: {C}lustering 20 amino acids into a few groups by the proposed greedy algorithm provides a new way to extract features from protein sequences to cover more adjacent amino acids and hence reduce the dimensionality of the input vector of protein features. {I}t was observed that a good amino acid grouping leads to an increase in prediction performance. {F}urthermore, a proper choice of a subset of complementary support vector machines constructed by different features of proteins maximizes the prediction accuracy.},
  doi      = {10.1186/1471-2105-6-174},
  pdf      = {../local/Wang2005Protein.pdf},
  file     = {Wang2005Protein.pdf:local/Wang2005Protein.pdf:PDF},
  keywords = {biosvm},
  pii      = {1471-2105-6-174},
  url      = {http://dx.doi.org/10.1186/1471-2105-6-174}
}
@article{Wang2004Predicting,
  author   = {Long-Hui Wang and Juan Liu and Yan-Fu Li and Huai-Bei Zhou},
  title    = {Predicting protein secondary structure by a support vector machine based on a new coding scheme.},
  journal  = {Genome {I}nform {S}er {W}orkshop {G}enome {I}nform},
  year     = {2004},
  volume   = {15},
  pages    = {181-90},
  number   = {2},
  abstract = {Protein structure prediction is one of the most important problems in modern computational biology. {P}rotein secondary structure prediction is a key step in prediction of protein tertiary structure. {T}here have emerged many methods based on machine learning techniques, such as neural networks ({NN}) and support vector machine ({SVM}) etc., to focus on the prediction of the secondary structures. {I}n this paper, a new method was proposed based on {SVM}. {D}ifferent from the existing methods, this method takes into account of the physical-chemical properties and structure properties of amino acids. {W}hen tested on the most popular dataset {CB}513, it achieved a {Q}(3) accuracy of 0.7844, which illustrates that it is one of the top range methods for protein of secondary structure prediction.},
  keywords = {biosvm},
  url      = {http://www.jsbi.org/journal/GIW04/GIW04F019.html}
}
@article{Wang2005Using,
  author   = {Wang, M. and Yang, J. and Chou, K-C.},
  title    = {Using string kernel to predict signal peptide cleavage site based on subsite coupling model.},
  journal  = {Amino {A}cids},
  year     = {2005},
  volume   = {28},
  pages    = {395--402},
  number   = {4},
  month    = jun,
  abstract = {Owing to the importance of signal peptides for studying the molecular mechanisms of genetic diseases, reprogramming cells for gene therapy, and finding new drugs for healing a specific defect, it is in great demand to develop a fast and accurate method to identify the signal peptides. {I}ntroduction of the so-called \{-3,-1, +1\} coupling model ({C}hou, {K}. {C}.: {P}rotein {E}ngineering, 2001, 14-2, 75-79) has made it possible to take into account the coupling effect among some key subsites and hence can significantly enhance the prediction quality of peptide cleavage site. {B}ased on the subsite coupling model, a kind of string kernels for protein sequence is introduced. {I}ntegrating the biologically relevant prior knowledge, the constructed string kernels can thus be used by any kernel-based method. {A} {S}upport vector machines ({SVM}) is thus built to predict the cleavage site of signal peptides from the protein sequences. {T}he current approach is compared with the classical weight matrix method. {A}t small false positive ratios, our method outperforms the classical weight matrix method, indicating the current approach may at least serve as a powerful complemental tool to other existing methods for predicting the signal peptide cleavage site. {T}he software that generated the results reported in this paper is available upon requirement, and will appear at http://www.pami.sjtu.edu.cn/wm.},
  doi      = {10.1007/s00726-005-0189-6},
  pdf      = {../local/Wang2005Using.pdf},
  file     = {Wang2005Using.pdf:local/Wang2005Using.pdf:PDF},
  keywords = {biosvm},
  url      = {http://dx.doi.org/10.1007/s00726-005-0189-6}
}
@article{Wang2004Weighted-support,
  author   = {Wang, M. and Yang, J. and Liu, G.-P. and Xu, Z.-J. and Chou, K.-C.},
  title    = {Weighted-support vector machines for predicting membrane protein types based on pseudo-amino acid composition},
  journal  = {Protein {E}ng. {D}es. {S}el.},
  year     = {2004},
  volume   = {17},
  pages    = {509--516},
  number   = {6},
  abstract = {Membrane proteins are generally classified into the following five types: (1) type {I} membrane proteins, (2) type {II} membrane proteins, (3) multipass transmembrane proteins, (4) lipid chain-anchored membrane proteins and (5) {GPI}-anchored membrane proteins. {P}rediction of membrane protein types has become one of the growing hot topics in bioinformatics. {C}urrently, we are facing two critical challenges in this area: first, how to take into account the extremely complicated sequence-order effects, and second, how to deal with the highly uneven sizes of the subsets in a training dataset. {I}n this paper, stimulated by the concept of using the pseudo-amino acid composition to incorporate the sequence-order effects, the spectral analysis technique is introduced to represent the statistical sample of a protein. {B}ased on such a framework, the weighted support vector machine ({SVM}) algorithm is applied. {T}he new approach has remarkable power in dealing with the bias caused by the situation when one subset in the training dataset contains many more samples than the other. {T}he new method is particularly useful when our focus is aimed at proteins belonging to small subsets. {T}he results obtained by the self-consistency test, jackknife test and independent dataset test are encouraging, indicating that the current approach may serve as a powerful complementary tool to other existing methods for predicting the types of membrane proteins.},
  doi      = {10.1093/protein/gzh061},
  eprint   = {http://peds.oupjournals.org/cgi/reprint/17/6/509.pdf},
  pdf      = {../local/Wang2004Weighted-support.pdf},
  file     = {Wang2004Weighted-support.pdf:local/Wang2004Weighted-support.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://dx.doi.org/10.1093/protein/gzh061}
}
@article{Wang2004Support,
  author   = {Wang, M-L. and Li, W-J. and Wang, M-L. and Xu, W-B.},
  title    = {Support vector machines for prediction of peptidyl prolyl cis/trans isomerization.},
  journal  = {J {P}ept {R}es},
  year     = {2004},
  volume   = {63},
  pages    = {23--28},
  number   = {1},
  month    = jan,
  abstract = {A new method for peptidyl prolyl cis/trans isomerization prediction based on the theory of support vector machines ({SVM}) was introduced. {T}he {SVM} represents a new approach to supervised pattern classification and has been successfully applied to a wide range of pattern recognition problems. {I}n this study, six training datasets consisting of different length local sequence respectively were used. {T}he polynomial kernel functions with different parameter d were chosen. {T}he test for the independent testing dataset and the jackknife test were both carried out. {W}hen the local sequence length was 20-residue and the parameter d = 8, the {SVM} method archived the best performance with the correct rate for the cis and trans forms reaching 70.4 and 69.7\% for the independent testing dataset, 76.7 and 76.6\% for the jackknife test, respectively. {M}atthew's correlation coefficients for the jackknife test could reach about 0.5. {T}he results obtained through this study indicated that the {SVM} method would become a powerful tool for predicting peptidyl prolyl cis/trans isomerization.},
  keywords = {biosvm},
  pii      = {100}
}
@article{Wang2005Prediction,
  author   = {Wang, Ming-Lei and Yao, Hui and Xu, Wen-Bo},
  title    = {Prediction by support vector machines and analysis by {Z}-score of poly-{L}-proline type {II} conformation based on local sequence.},
  journal  = {Comput. {B}iol. {C}hem.},
  year     = {2005},
  volume   = {29},
  pages    = {95--100},
  number   = {2},
  month    = apr,
  abstract = {In recent years, the poly-{L}-proline type {II} ({PPII}) conformation has gained more and more importance. {T}his structure plays vital roles in many biological processes. {B}ut few studies have been made to predict {PPII} secondary structures computationally. {T}he support vector machine ({SVM}) represents a new approach to supervised pattern classification and has been successfully applied to a wide range of pattern recognition problems. {I}n this paper, we present a {SVM} prediction method of {PPII} conformation based on local sequence. {T}he overall accuracy for both the independent testing set and estimate of jackknife testing reached approximately 70\%. {M}atthew's correlation coefficient ({MCC}) could reach 0.4. {B}y comparing the results of training and testing datasets with different sequence identities, we suggest that the performance of this method correlates with the sequence identity of dataset. {T}he parameter of {SVM} kernel function was an important factor to the performance of this method. {T}he propensities of residues located at different positions were also analyzed. {B}y computing {Z}-scores, we found that {P} and {G} were the two most important residues to {PPII} structure conformation.},
  doi      = {10.1016/j.compbiolchem.2005.02.002},
  pdf      = {../local/Wang2005Prediction.pdf},
  file     = {Wang2005Prediction.pdf:local/Wang2005Prediction.pdf:PDF},
  keywords = {biosvm},
  pii      = {S1476-9271(05)00017-4},
  url      = {http://dx.doi.org/10.1016/j.compbiolchem.2005.02.002}
}
@article{Wang2005Gene,
  author   = {Wang, Yu and Tetko, Igor V and Hall, Mark A and Frank, Eibe and Facius, Axel and Mayer, Klaus F X and Mewes, Hans W},
  title    = {Gene selection from microarray data for cancer classification--a machine learning approach.},
  journal  = {Comput. {B}iol. {C}hem.},
  year     = {2005},
  volume   = {29},
  pages    = {37--46},
  number   = {1},
  month    = feb,
  abstract = {A {DNA} microarray can track the expression levels of thousands of genes simultaneously. {P}revious research has demonstrated that this technology can be useful in the classification of cancers. {C}ancer microarray data normally contains a small number of samples which have a large number of gene expression levels as features. {T}o select relevant genes involved in different types of cancer remains a challenge. {I}n order to extract useful gene information from cancer microarray data and reduce dimensionality, feature selection algorithms were systematically investigated in this study. {U}sing a correlation-based feature selector combined with machine learning algorithms such as decision trees, na{\"\i}ve {B}ayes and support vector machines, we show that classification performance at least as good as published results can be obtained on acute leukemia and diffuse large {B}-cell lymphoma microarray data sets. {W}e also demonstrate that a combined use of different classification and feature selection approaches makes it possible to select relevant genes with high confidence. {T}his is also the first paper which discusses both computational and biological evidence for the involvement of zyxin in leukaemogenesis.},
  doi      = {10.1016/j.compbiolchem.2004.11.001},
  pdf      = {../local/Wang2005Gene.pdf},
  file     = {Wang2005Gene.pdf:local/Wang2005Gene.pdf:PDF},
  keywords = {biosvm microarray},
  pii      = {S1476-9271(04)00108-2},
  url      = {http://dx.doi.org/10.1016/j.compbiolchem.2004.11.001}
}
@article{Ward2003Secondary,
  author   = {Ward, J. J. and McGuffin, L. J. and Buxton, B. F. and Jones, D. T.},
  title    = {Secondary structure prediction with support vector machines},
  journal  = {Bioinformatics},
  year     = {2003},
  volume   = {19},
  pages    = {1650--1655},
  number   = {13},
  abstract = {Motivation: {A} new method that uses support vector machines ({SVM}s) to predict protein secondary structure is described and evaluated. {T}he study is designed to develop a reliable prediction method using an alternative technique and to investigate the applicability of {SVM}s to this type of bioinformatics problem. {M}ethods: {B}inary {SVM}s are trained to discriminate between two structural classes. {T}he binary classifiers are combined in several ways to predict multi-class secondary structure. {R}esults: {T}he average three-state prediction accuracy per protein ({Q}3) is estimated by cross-validation to be 77.07 {+/-} 0.26\% with a segment overlap ({S}ov) score of 73.32 {+/-} 0.39\%. {T}he {SVM} performs similarly to the 'state-of-the-art' {PSIPRED} prediction method on a non-homologous test set of 121 proteins despite being trained on substantially fewer examples. {A} simple consensus of the {SVM}, {PSIPRED} and {PROF}sec achieves significantly higher prediction accuracy than the individual methods. {A}vailability: {T}he {SVM} classifier is available from the authors. {W}ork is in progress to make the method available on-line and to integrate the {SVM} predictions into the {PSIPRED} server.},
  pdf      = {../local/Ward2003Secondary.pdf},
  file     = {Ward2003Secondary.pdf:local/Ward2003Secondary.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/13/1650}
}
@article{Waring2004Interlaboratory,
  author   = {Waring, Jeffrey F and Ulrich, Roger G and Flint, Nick and Morfitt, David and Kalkuhl, Arno and Staedtler, Frank and Lawton, Michael and Beekman, Johanna M and Suter, Laura},
  title    = {Interlaboratory evaluation of rat hepatic gene expression changes induced by methapyrilene.},
  journal  = {Environ {H}ealth {P}erspect},
  year     = {2004},
  volume   = {112},
  pages    = {439--448},
  number   = {4},
  month    = mar,
  abstract = {Several studies using microarrays have shown that changes in gene expression provide information about the mechanism of toxicity induced by xenobiotic agents. {N}evertheless, the issue of whether gene expression profiles are reproducible across different laboratories remains to be determined. {T}o address this question, several members of the {H}epatotoxicity {W}orking {G}roup of the {I}nternational {L}ife {S}ciences {I}nstitute {H}ealth and {E}nvironmental {S}ciences {I}nstitute evaluated the liver gene expression profiles of rats treated with methapyrilene ({MP}). {A}nimals were treated at one facility, and {RNA} was distributed to five different sites for gene expression analysis. {A} preliminary evaluation of the number of modulated genes uncovered striking differences between the five different sites. {H}owever, additional data analysis demonstrated that these differences had an effect on the absolute gene expression results but not on the outcome of the study. {F}or all users, unsupervised algorithms showed that gene expression allows the distinction of the high dose of {MP} from controls and low dose. {I}n addition, the use of a supervised analysis method (support vector machines) made it possible to correctly classify samples. {I}n conclusion, the results show that, despite some variability, robust gene expression changes were consistent between sites. {I}n addition, key expression changes related to the mechanism of {MP}-induced hepatotoxicity were identified. {T}hese results provide critical information regarding the consistency of microarray results across different laboratories and shed light on the strengths and limitations of expression profiling in drug safety analysis.},
  keywords = {biosvm}
}
@article{Warmuth2003Active,
  author   = {Warmuth, M. K. and Liao, J. and R{\"a}tsch, G. and Mathieson, M. and Putta, S. and Lemmen, C.},
  title    = {Active learning with support vector machines in the drug discovery process.},
  journal  = {J {C}hem {I}nf {C}omput {S}ci},
  year     = {2003},
  volume   = {43},
  pages    = {667--673},
  number   = {2},
  abstract = {We investigate the following data mining problem from computer-aided drug design: {F}rom a large collection of compounds, find those that bind to a target molecule in as few iterations of biochemical testing as possible. {I}n each iteration a comparatively small batch of compounds is screened for binding activity toward this target. {W}e employed the so-called "active learning paradigm" from {M}achine {L}earning for selecting the successive batches. {O}ur main selection strategy is based on the maximum margin hyperplane-generated by "{S}upport {V}ector {M}achines". {T}his hyperplane separates the current set of active from the inactive compounds and has the largest possible distance from any labeled compound. {W}e perform a thorough comparative study of various other selection strategies on data sets provided by {D}u{P}ont {P}harmaceuticals and show that the strategies based on the maximum margin hyperplane clearly outperform the simpler ones.},
  doi      = {10.1021/ci025620t},
  pdf      = {../local/Warmuth2003Active.pdf},
  file     = {Warmuth2003Active.pdf:local/Warmuth2003Active.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://dx.doi.org/10.1021/ci025620t}
}
@inproceedings{Warmuth2002Active,
  author    = {Warmuth, M. K. and R{\"a}tsch, G. and Mathieson, M. and Liao, L. and Lemmen, C.},
  title     = {Active learning in the drug discovery process},
  booktitle = {Adv. {N}eural {I}nform. {P}rocess. {S}yst.},
  year      = {2002},
  editor    = {Dietterich, T. G. and Becker, S. and Ghahramani, Z.},
  volume    = {14},
  pages     = {1449--1456},
  publisher = {MIT Press},
  keywords  = {biosvm},
  subject   = {qsar}
}
@incollection{Watkins2000Dynamic,
  author    = {Watkins, C.},
  title     = {Dynamic alignment kernels},
  booktitle = {Advances in {L}arge {M}argin {C}lassifiers},
  publisher = {MIT Press},
  year      = {2000},
  editor    = {Smola, A. J. and Bartlett, P. L. and Sch{\"o}lkopf, B. and Schuurmans, D.},
  pages     = {39--50},
  address   = {Cambridge, MA},
  pdf       = {../local/Watkins2000Dynamic.pdf},
  file      = {Watkins2000Dynamic.pdf:local/Watkins2000Dynamic.pdf:PDF},
  keywords  = {biosvm},
  subject   = {kernel},
  url       = {http://www.cs.rhbnc.ac.uk/home/chrisw/dynk.ps.gz}
}
@article{Weathers2004Reduced,
  author   = {Weathers, E. A. and Paulaitis, M. E. and Woolf, T. B. and Hoh, J. H.},
  title    = {Reduced amino acid alphabet is sufficient to accurately recognize intrinsically disordered protein.},
  journal  = {F{EBS} {L}ett.},
  year     = {2004},
  volume   = {576},
  pages    = {348--352},
  number   = {3},
  abstract = {Intrinsically disordered proteins are an important class of proteins with unique functions and properties. {H}ere, we have applied a support vector machine ({SVM}) trained on naturally occurring disordered and ordered proteins to examine the contribution of various parameters (vectors) to recognizing proteins that contain disordered regions. {W}e find that a {SVM} that incorporates only amino acid composition has a recognition accuracy of 87+/-2\%. {T}his result suggests that composition alone is sufficient to accurately recognize disorder. {I}nterestingly, {SVM}s using reduced sets of amino acids based on chemical similarity preserve high recognition accuracy. {A} set as small as four retains an accuracy of 84+/-2\%; this suggests that general physicochemical properties rather than specific amino acids are important factors contributing to protein disorder.},
  doi      = {10.1016/j.febslet.2004.09.036},
  pdf      = {../local/Weathers2004Reduced.pdf},
  file     = {Weathers2004Reduced.pdf:local/Weathers2004Reduced.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://dx.doi.org/10.1016/j.febslet.2004.09.036}
}
@article{Weston2003Feature,
  author   = {Weston, J. and P{\'e}rez-Cruz, F. and Bousquet, O. and Chapelle, O. and Elisseeff, A. and Sch{\"o}lkopf, B.},
  title    = {Feature selection and transduction for prediction of molecular bioactivity for drug design},
  journal  = {Bioinformatics},
  year     = {2003},
  volume   = {19},
  pages    = {764--771},
  number   = {6},
  abstract = {Motivation: {I}n drug discovery a key task is to identify characteristics that separate active (binding) compounds from inactive (non-binding) ones. {A}n automated prediction system can help reduce resources necessary to carry out this task. {R}esults: {T}wo methods for prediction of molecular bioactivity for drug design are introduced and shown to perform well in a data set previously studied as part of the {KDD} ({K}nowledge {D}iscovery and {D}ata {M}ining) {C}up 2001. {T}he data is characterized by very few positive examples, a very large number of features (describing three-dimensional properties of the molecules) and rather different distributions between training and test data. {T}wo techniques are introduced specifically to tackle these problems: a feature selection method for unbalanced data and a classifier which adapts to the distribution of the unlabeled test data (a so-called transductive method). {W}e show both techniques improve identification performance and in conjunction provide an improvement over using only one of the techniques. {O}ur results suggest the importance of taking into account the characteristics in this data which may also be relevant in other problems of a similar type. {A}vailability: {M}atlab source code is available at http://www.kyb.tuebingen.mpg.de/bs/people/weston/kdd/kdd.html {C}ontact: jason.weston@tuebingen.mpg.de {S}upplementary information: {S}upplementary material is available at http://www.kyb.tuebingen.mpg.de/bs/people/weston/kdd/kdd.html.},
  pdf      = {../local/Weston2003Feature.pdf},
  file     = {Weston2003Feature.pdf:local/Weston2003Feature.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/6/764}
}
@article{Williams2004Prognostic,
  author   = {Williams, R. D. and Hing, S. N. and Greer, B. T. and Whiteford, C. C. and Wei, J. S. and Natrajan, R. and Kelsey, A. and Rogers, S. and Campbell, C. and Pritchard-Jones, K. and Khan, J.},
  title    = {Prognostic classification of relapsing favorable histology {W}ilms tumor using c{DNA} microarray expression profiling and support vector machines.},
  journal  = {Genes {C}hromosomes {C}ancer},
  year     = {2004},
  volume   = {41},
  pages    = {65--79},
  number   = {1},
  month    = sep,
  abstract = {Treatment of {W}ilms tumor has a high success rate, with some 85\% of patients achieving long-term survival. {H}owever, late effects of treatment and management of relapse remain significant clinical problems. {I}f accurate prognostic methods were available, effective risk-adapted therapies could be tailored to individual patients at diagnosis. {F}ew molecular prognostic markers for {W}ilms tumor are currently defined, though previous studies have linked allele loss on 1p or 16q, genomic gain of 1q, and overexpression from 1q with an increased risk of relapse. {T}o identify specific patterns of gene expression that are predictive of relapse, we used high-density (30 k) c{DNA} microarrays to analyze {RNA} samples from 27 favorable histology {W}ilms tumors taken from primary nephrectomies at the time of initial diagnosis. {T}hirteen of these tumors relapsed within 2 years. {G}enes differentially expressed between the relapsing and nonrelapsing tumor classes were identified by statistical scoring (t test). {T}hese genes encode proteins with diverse molecular functions, including transcription factors, developmental regulators, apoptotic factors, and signaling molecules. {U}se of a support vector machine classifier, feature selection, and test evaluation using cross-validation led to identification of a generalizable expression signature, a small subset of genes whose expression potentially can be used to predict tumor outcome in new samples. {S}imilar methods were used to identify genes that are differentially expressed between tumors with and without genomic 1q gain. {T}his set of discriminators was highly enriched in genes on 1q, indicating close agreement between data obtained from expression profiling with data from genomic copy number analyses.},
  doi      = {10.1002/gcc.20060},
  pdf      = {../local/Williams2004Prognostic.pdf},
  file     = {Williams2004Prognostic.pdf:local/Williams2004Prognostic.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://dx.doi.org/10.1002/gcc.20060}
}
@article{Wilton2003Comparison,
  author   = {Wilton, D. and Willett, P. and Lawson, K. and Mullier, G.},
  title    = {Comparison of ranking methods for virtual screening in lead-discovery programs.},
  journal  = {J {C}hem {I}nf {C}omput {S}ci},
  year     = {2003},
  volume   = {43},
  pages    = {469--474},
  number   = {2},
  abstract = {This paper discusses the use of several rank-based virtual screening methods for prioritizing compounds in lead-discovery programs, given a training set for which both structural and bioactivity data are available. {S}tructures from the {NCI} {AIDS} data set and from the {S}yngenta corporate database were represented by two types of fragment bit-string and by sets of high-level molecular features. {T}hese representations were processed using binary kernel discrimination, similarity searching, substructural analysis, support vector machine, and trend vector analysis, with the effectiveness of the methods being judged by the extent to which active test set molecules were clustered toward the top of the resultant rankings. {T}he binary kernel discrimination approach yielded consistently superior rankings and would appear to have considerable potential for chemical screening applications.},
  doi      = {10.1021/ci025586i},
  pdf      = {../local/Wilton2003Comparison.pdf},
  file     = {Wilton2003Comparison.pdf:local/Wilton2003Comparison.pdf:PDF},
  keywords = {biosvm},
  url      = {http://dx.doi.org/10.1021/ci025586i}
}
@article{Winters-Hilt2003Highly,
  author   = {Winters-Hilt, S. and Vercoutere, W. and DeGuzman, V. S. and Deamer, D. and Akeson, M. and Haussler, D.},
  title    = {Highly accurate classification of {W}atson-{C}rick basepairs on termini of single {DNA} molecules.},
  journal  = {Biophys. {J}.},
  year     = {2003},
  volume   = {84},
  pages    = {967--976},
  number   = {2},
  abstract = {We introduce a computational method for classification of individual {DNA} molecules measured by an $\alpha$-hemolysin channel detector. {W}e show classification with better than 99\% accuracy for {DNA} hairpin molecules that differ only in their terminal {W}atson-{C}rick basepairs. {S}ignal classification was done in silico to establish performance metrics (i.e., where train and test data were of known type, via single-species data files). {I}t was then performed in solution to assay real mixtures of {DNA} hairpins. {H}idden {M}arkov {M}odels ({HMM}s) were used with {E}xpectation/{M}aximization for denoising and for associating a feature vector with the ionic current blockade of the {DNA} molecule. {S}upport {V}ector {M}achines ({SVM}s) were used as discriminators, and were the focus of off-line training. {A} multiclass {SVM} architecture was designed to place less discriminatory load on weaker discriminators, and novel {SVM} kernels were used to boost discrimination strength. {T}he tuning on {HMM}s and {SVM}s enabled biophysical analysis of the captured molecule states and state transitions; structure revealed in the biophysical analysis was used for better feature selection.},
  pdf      = {../local/Winters-Hilt2003Highly.pdf},
  file     = {Winters-Hilt2003Highly.pdf:local/Winters-Hilt2003Highly.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://www.biophysj.org/cgi/content/abstract/84/2/967}
}
@article{Wu2003Comparison,
  author   = {Wu, B. and Abbott, T. and Fishman, D. and McMurray, W. and Mor, G. and Stone, K. and Ward, D. and Williams, K. and Zhao, H.},
  title    = {Comparison of statistical methods for classification of ovarian cancer using mass spectrometry data},
  journal  = {Bioinformatics},
  year     = {2003},
  volume   = {19},
  pages    = {1636--1643},
  number   = {13},
  abstract = {Motivation: {N}ovel methods, both molecular and statistical, are urgently needed to take advantage of recent advances in biotechnology and the human genome project for disease diagnosis and prognosis. {M}ass spectrometry ({MS}) holds great promise for biomarker identification and genome-wide protein profiling. {I}t has been demonstrated in the literature that biomarkers can be identified to distinguish normal individuals from cancer patients using {MS} data. {S}uch progress is especially exciting for the detection of early-stage ovarian cancer patients. {A}lthough various statistical methods have been utilized to identify biomarkers from {MS} data, there has been no systematic comparison among these approaches in their relative ability to analyze {MS} data. {R}esults: {W}e compare the performance of several classes of statistical methods for the classification of cancer based on {MS} spectra. {T}hese methods include: linear discriminant analysis, quadratic discriminant analysis, k-nearest neighbor classifier, bagging and boosting classification trees, support vector machine, and random forest ({RF}). {T}he methods are applied to ovarian cancer and control serum samples from the {N}ational {O}varian {C}ancer {E}arly {D}etection {P}rogram clinic at {N}orthwestern {U}niversity {H}ospital. {W}e found that {RF} outperforms other methods in the analysis of {MS} data. {S}upplementary information: http://bioinformatics.med.yale.edu/proteomics/{B}io{S}upp1.html},
  pdf      = {../local/Wu2003Comparison.pdf},
  file     = {Wu2003Comparison.pdf:local/Wu2003Comparison.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/13/1636}
}
@article{Xie2005LOCSVMPSI,
  author   = {Xie, Dan and Li, Ao and Wang, Minghui and Fan, Zhewen and Feng, Huanqing},
  title    = {L{OCSVMPSI}: a web server for subcellular localization of eukaryotic proteins using {SVM} and profile of {PSI}-{BLAST}.},
  journal  = {Nucleic {A}cids {R}es.},
  year     = {2005},
  volume   = {33},
  pages    = {W105--W110},
  number   = {Web Server issue},
  month    = jul,
  abstract = {Subcellular location of a protein is one of the key functional characters as proteins must be localized correctly at the subcellular level to have normal biological function. {I}n this paper, a novel method named {LOCSVMPSI} has been introduced, which is based on the support vector machine ({SVM}) and the position-specific scoring matrix generated from profiles of {PSI}-{BLAST}. {W}ith a jackknife test on the {RH}2427 data set, {LOCSVMPSI} achieved a high overall prediction accuracy of 90.2\%, which is higher than the prediction results by {S}ub{L}oc and {ESL}pred on this data set. {I}n addition, prediction performance of {LOCSVMPSI} was evaluated with 5-fold cross validation test on the {PK}7579 data set and the prediction results were consistently better than the previous method based on several {SVM}s using composition of both amino acids and amino acid pairs. {F}urther test on the {SWISSPROT} new-unique data set showed that {LOCSVMPSI} also performed better than some widely used prediction methods, such as {PSORTII}, {T}arget{P} and {LOC}net. {A}ll these results indicate that {LOCSVMPSI} is a powerful tool for the prediction of eukaryotic protein subcellular localization. {A}n online web server (current version is 1.3) based on this method has been developed and is freely available to both academic and commercial users, which can be accessed by at http://{B}ioinformatics.ustc.edu.cn/{LOCSVMPSI}/{LOCSVMPSI}.php.},
  doi      = {10.1093/nar/gki359},
  pdf      = {../local/Xie2005LOCSVMPSI.pdf},
  file     = {Xie2005LOCSVMPSI.pdf:local/Xie2005LOCSVMPSI.pdf:PDF},
  keywords = {biosvm},
  pii      = {33/suppl_2/W105},
  url      = {http://dx.doi.org/10.1093/nar/gki359}
}
@article{Xiong2001Biomarker,
  author   = {Xiong, M. and Fang, X. and Zhao, J.},
  title    = {Biomarker {I}dentification by {F}eature {W}rappers},
  journal  = {Genome {R}es.},
  year     = {2001},
  volume   = {11},
  pages    = {1878--1887},
  number   = {11},
  abstract = {Gene expression studies bridge the gap between {DNA} information and trait information by dissecting biochemical pathways into intermediate components between genotype and phenotype. {T}hese studies open new avenues for identifying complex disease genes and biomarkers for disease diagnosis and for assessing drug efficacy and toxicity. {H}owever, the majority of analytical methods applied to gene expression data are not efficient for biomarker identification and disease diagnosis. {I}n this paper, we propose a general framework to incorporate feature (gene) selection into pattern recognition in the process to identify biomarkers. {U}sing this framework, we develop three feature wrappers that search through the space of feature subsets using the classification error as measure of goodness for a particular feature subset being "wrapped around": linear discriminant analysis, logistic regression, and support vector machines. {T}o effectively carry out this computationally intensive search process, we employ sequential forward search and sequential forward floating search algorithms. {T}o evaluate the performance of feature selection for biomarker identification we have applied the proposed methods to three data sets. {T}he preliminary results demonstrate that very high classification accuracy can be attained by identified composite classifiers with several biomarkers.},
  pdf      = {../local/Xiong2001Biomarker.pdf},
  file     = {Xiong2001Biomarker.pdf:local/Xiong2001Biomarker.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://www.genome.org/cgi/content/abstract/11/11/1878}
}
@article{Xu2004Molecular,
  author   = {Xiu-Qin Xu and Chon K Leow and Xin Lu and Xuegong Zhang and Jun S Liu and Wing-Hung Wong and Arndt Asperger and S{\"o}ren Deininger and Hon-Chiu Eastwood Leung},
  title    = {Molecular classification of liver cirrhosis in a rat model by proteomics and bioinformatics.},
  journal  = {Proteomics},
  year     = {2004},
  volume   = {4},
  pages    = {3235-45},
  number   = {10},
  month    = {Oct},
  abstract = {Liver cirrhosis is a worldwide health problem. {R}eliable, noninvasive methods for early detection of liver cirrhosis are not available. {U}sing a three-step approach, we classified sera from rats with liver cirrhosis following different treatment insults. {T}he approach consisted of: (i) protein profiling using surface-enhanced laser desorption/ionization ({SELDI}) technology; (ii) selection of a statistically significant serum biomarker set using machine learning algorithms; and (iii) identification of selected serum biomarkers by peptide sequencing. {W}e generated serum protein profiles from three groups of rats: (i) normal (n=8), (ii) thioacetamide-induced liver cirrhosis (n=22), and (iii) bile duct ligation-induced liver fibrosis (n=5) using a weak cation exchanger surface. {P}rofiling data were further analyzed by a recursive support vector machine algorithm to select a panel of statistically significant biomarkers for class prediction. {S}ensitivity and specificity of classification using the selected protein marker set were higher than 92\%. {A} consistently down-regulated 3495 {D}a protein in cirrhosis samples was one of the selected significant biomarkers. {T}his 3495 {D}a protein was purified on-chip and trypsin digested. {F}urther structural characterization of this biomarkers candidate was done by using cross-platform matrix-assisted laser desorption/ionization mass spectrometry ({MALDI}-{MS}) peptide mass fingerprinting ({PMF}) and matrix-assisted laser desorption/ionization time of flight/time of flight ({MALDI}-{TOF}/{TOF}) tandem mass spectrometry ({MS}/{MS}). {C}ombined data from {PMF} and {MS}/{MS} spectra of two tryptic peptides suggested that this 3495 {D}a protein shared homology to a histidine-rich glycoprotein. {T}hese results demonstrated a novel approach to discovery of new biomarkers for early detection of liver cirrhosis and classification of liver diseases.},
  doi      = {10.1002/pmic.200400839},
  pdf      = {../local/Xu2004Molecular.pdf},
  file     = {Xu2004Molecular.pdf:local/Xu2004Molecular.pdf:PDF},
  keywords = {biosvm},
  url      = {http://dx.doi.org/10.1002/pmic.200400839}
}
@article{Xue2004Support,
  author   = {C. X. Xue and R. S. Zhang and H. X. Liu and M. C. Liu and Z. D. Hu and B. T. Fan},
  title    = {Support vector machines-based quantitative structure-property relationship for the prediction of heat capacity.},
  journal  = {J {C}hem {I}nf {C}omput {S}ci},
  year     = {2004},
  volume   = {44},
  pages    = {1267-74},
  number   = {4},
  abstract = {The support vector machine ({SVM}), as a novel type of learning machine, for the first time, was used to develop a {Q}uantitative {S}tructure-{P}roperty {R}elationship ({QSPR}) model of the heat capacity of a diverse set of 182 compounds based on the molecular descriptors calculated from the structure alone. {M}ultiple linear regression ({MLR}) and radial basis function networks ({RBFNN}s) were also utilized to construct quantitative linear and nonlinear models to compare with the results obtained by {SVM}. {T}he root-mean-square (rms) errors in heat capacity predictions for the whole data set given by {MLR}, {RBFNN}s, and {SVM} were 4.648, 4.337, and 2.931 heat capacity units, respectively. {T}he prediction results are in good agreement with the experimental value of heat capacity; also, the results reveal the superiority of the {SVM} over {MLR} and {RBFNN}s models.},
  doi      = {10.1021/ci049934n},
  pdf      = {../local/Xue2004Support.pdf},
  file     = {Xue2004Support.pdf:local/Xue2004Support.pdf:PDF},
  keywords = {biosvm},
  url      = {http://dx.doi.org/10.1021/ci049934n}
}
@article{Xue2004accurate,
  author   = {C. X. Xue and R. S. Zhang and H. X. Liu and X. J. Yao and M. C. Liu and Z. D. Hu and B. T. Fan},
  title    = {An accurate {QSPR} study of {O}-{H} bond dissociation energy in substituted phenols based on support vector machines.},
  journal  = {J {C}hem {I}nf {C}omput {S}ci},
  year     = {2004},
  volume   = {44},
  pages    = {669-77},
  number   = {2},
  abstract = {The support vector machine ({SVM}), as a novel type of learning machine, was used to develop a {Q}uantitative {S}tructure-{P}roperty {R}elationship ({QSPR}) model of the {O}-{H} bond dissociation energy ({BDE}) of 78 substituted phenols. {T}he six descriptors calculated solely from the molecular structures of compounds selected by forward stepwise regression were used as inputs for the {SVM} model. {T}he root-mean-square (rms) errors in {BDE} predictions for the training, test, and overall data sets were 3.808, 3.320, and 3.713 {BDE} units (k{J} mol(-1)), respectively. {T}he results obtained by {G}aussian-kernel {SVM} were much better than those obtained by multiple linear regression, radial basis function neural networks, linear-kernel {SVM}, and other {QSPR} approaches.},
  doi      = {10.1021/ci034248u},
  pdf      = {../local/Xue2004accurate.pdf},
  file     = {Xue2004accurate.pdf:local/Xue2004accurate.pdf:PDF},
  keywords = {biosvm},
  url      = {http://dx.doi.org/10.1021/ci034248u}
}
@article{Xue2004QSAR,
  author   = {C. X. Xue and R. S. Zhang and H. X. Liu and X. J. Yao and M. C. Liu and Z. D. Hu and B. T. Fan},
  title    = {Q{SAR} models for the prediction of binding affinities to human serum albumin using the heuristic method and a support vector machine.},
  journal  = {J {C}hem {I}nf {C}omput {S}ci},
  year     = {2004},
  volume   = {44},
  pages    = {1693-700},
  number   = {5},
  abstract = {The binding affinities to human serum albumin for 94 diverse drugs and drug-like compounds were modeled with the descriptors calculated from the molecular structure alone using a quantitative structure-activity relationship ({QSAR}) technique. {T}he heuristic method ({HM}) and support vector machine ({SVM}) were utilized to construct the linear and nonlinear prediction models, leading to a good correlation coefficient ({R}2) of 0.86 and 0.94 and root-mean-square errors (rms) of 0.212 and 0.134 albumin drug binding affinity units, respectively. {F}urthermore, the models were evaluated by a 10 compound external test set, yielding {R}2 of 0.71 and 0.89 and rms error of 0.430 and 0.222. {T}he specific information described by the heuristic linear model could give some insights into the factors that are likely to govern the binding affinity of the compounds and be used as an aid to the drug design process; however, the prediction results of the nonlinear {SVM} model seem to be better than that of the {HM}.},
  doi      = {10.1021/ci049820b},
  pdf      = {../local/Xue2004QSAR.pdf},
  file     = {Xue2004QSAR.pdf:local/Xue2004QSAR.pdf:PDF},
  keywords = {biosvm},
  url      = {http://dx.doi.org/10.1021/ci049820b}
}
@article{Xue2004Study,
  author   = {C. X. Xue and R. S. Zhang and M. C. Liu and Z. D. Hu and B. T. Fan},
  title    = {Study of the quantitative structure-mobility relationship of carboxylic acids in capillary electrophoresis based on support vector machines.},
  journal  = {J {C}hem {I}nf {C}omput {S}ci},
  year     = {2004},
  volume   = {44},
  pages    = {950-7},
  number   = {3},
  abstract = {The support vector machines ({SVM}), as a novel type of learning machine, were used to develop a quantitative structure-mobility relationship ({QSMR}) model of 58 aliphatic and aromatic carboxylic acids based on molecular descriptors calculated from the structure alone. {M}ultiple linear regression ({MLR}) and radial basis function neural networks ({RBFNN}s) were also utilized to construct the linear and the nonlinear model to compare with the results obtained by {SVM}. {T}he root-mean-square errors in absolute mobility predictions for the whole data set given by {MLR}, {RBFNN}s, and {SVM} were 1.530, 1.373, and 0.888 mobility units (10(-5) cm(2) {S}(-1) {V}(-1)), respectively, which indicated that the prediction result agrees well with the experimental values of these compounds and also revealed the superiority of {SVM} over {MLR} and {RBFNN}s models for the prediction of the absolute mobility of carboxylic acids. {M}oreover, the models we proposed could also provide some insight into what structural features are related to the absolute mobility of aliphatic and aromatic carboxylic acids.},
  doi      = {10.1021/ci034280o},
  pdf      = {../local/Xue2004Study.pdf},
  file     = {Xue2004Study.pdf:local/Xue2004Study.pdf:PDF},
  keywords = {biosvm},
  url      = {http://dx.doi.org/10.1021/ci034280o}
}
@article{Xue2004Effect,
  author   = {Y. Xue and Z. R. Li and C. W. Yap and L. Z. Sun and X. Chen and Y. Z. Chen},
  title    = {Effect of molecular descriptor feature selection in support vector machine classification of pharmacokinetic and toxicological properties of chemical agents.},
  journal  = {J {C}hem {I}nf {C}omput {S}ci},
  year     = {2004},
  volume   = {44},
  pages    = {1630-8},
  number   = {5},
  abstract = {Statistical-learning methods have been developed for facilitating the prediction of pharmacokinetic and toxicological properties of chemical agents. {T}hese methods employ a variety of molecular descriptors to characterize structural and physicochemical properties of molecules. {S}ome of these descriptors are specifically designed for the study of a particular type of properties or agents, and their use for other properties or agents might generate noise and affect the prediction accuracy of a statistical learning system. {T}his work examines to what extent the reduction of this noise can improve the prediction accuracy of a statistical learning system. {A} feature selection method, recursive feature elimination ({RFE}), is used to automatically select molecular descriptors for support vector machines ({SVM}) prediction of {P}-glycoprotein substrates ({P}-gp), human intestinal absorption of molecules ({HIA}), and agents that cause torsades de pointes ({T}d{P}), a rare but serious side effect. {RFE} significantly reduces the number of descriptors for each of these properties thereby increasing the computational speed for their classification. {T}he {SVM} prediction accuracies of {P}-gp and {HIA} are substantially increased and that of {T}d{P} remains unchanged by {RFE}. {T}hese prediction accuracies are comparable to those of earlier studies derived from a selective set of descriptors. {O}ur study suggests that molecular feature selection is useful for improving the speed and, in some cases, the accuracy of statistical learning methods for the prediction of pharmacokinetic and toxicological properties of chemical agents.},
  doi      = {10.1021/ci049869h},
  pdf      = {../local/Xue2004Effect.pdf},
  file     = {Xue2004Effect.pdf:local/Xue2004Effect.pdf:PDF},
  keywords = {biosvm},
  url      = {http://dx.doi.org/10.1021/ci049869h}
}
@article{Xue2004Prediction,
  author   = {Y. Xue and C. W. Yap and L. Z. Sun and Z. W. Cao and J. F. Wang and Y. Z. Chen},
  title    = {Prediction of {P}-glycoprotein substrates by a support vector machine approach.},
  journal  = {J {C}hem {I}nf {C}omput {S}ci},
  year     = {2004},
  volume   = {44},
  pages    = {1497-505},
  number   = {4},
  abstract = {P-glycoproteins ({P}-gp) actively transport a wide variety of chemicals out of cells and function as drug efflux pumps that mediate multidrug resistance and limit the efficacy of many drugs. {M}ethods for facilitating early elimination of potential {P}-gp substrates are useful for facilitating new drug discovery. {A} computational ensemble pharmacophore model has recently been used for the prediction of {P}-gp substrates with a promising accuracy of 63\%. {I}t is desirable to extend the prediction range beyond compounds covered by the known pharmacophore models. {F}or such a purpose, a machine learning method, support vector machine ({SVM}), was explored for the prediction of {P}-gp substrates. {A} set of 201 chemical compounds, including 116 substrates and 85 nonsubstrates of {P}-gp, was used to train and test a {SVM} classification system. {T}his {SVM} system gave a prediction accuracy of at least 81.2\% for {P}-gp substrates based on two different evaluation methods, which is substantially improved against that obtained from the multiple-pharmacophore model. {T}he prediction accuracy for nonsubstrates of {P}-gp is 79.2\% using 5-fold cross-validation. {T}hese accuracies are slightly better than those obtained from other statistical classification methods, including k-nearest neighbor (k-{NN}), probabilistic neural networks ({PNN}), and {C}4.5 decision tree, that use the same sets of data and molecular descriptors. {O}ur study indicates the potential of {SVM} in facilitating the prediction of {P}-gp substrates.},
  doi      = {10.1021/ci049971e},
  pdf      = {../local/Xue2004Prediction.pdf},
  file     = {Xue2004Prediction.pdf:local/Xue2004Prediction.pdf:PDF},
  keywords = {biosvm},
  url      = {http://dx.doi.org/10.1021/ci049971e}
}
@article{Yabuki2005GRIFFIN,
  author   = {Yabuki, Y. and Muramatsu, T. and Hirokawa, T. and Mukai, H. and Suwa, M.},
  title    = {G{RIFFIN}: a system for predicting {GPCR}-{G}-protein coupling selectivity using a support vector machine and a hidden {M}arkov model.},
  journal  = {Nucleic {A}cids {R}es.},
  year     = {2005},
  volume   = {33},
  pages    = {W148-53},
  number   = {Web Server issue},
  month    = {Jul},
  abstract = {We describe a novel system, {GRIFFIN} ({G}-protein and {R}eceptor {I}nteraction {F}eature {F}inding {IN}strument), that predicts {G}-protein coupled receptor ({GPCR}) and {G}-protein coupling selectivity based on a support vector machine ({SVM}) and a hidden {M}arkov model ({HMM}) with high sensitivity and specificity. {B}ased on our assumption that whole structural segments of ligands, {GPCR}s and {G}-proteins are essential to determine {GPCR} and {G}-protein coupling, various quantitative features were selected for ligands, {GPCR}s and {G}-protein complex structures, and those parameters that are the most effective in selecting {G}-protein type were used as feature vectors in the {SVM}. {T}he main part of {GRIFFIN} includes a hierarchical {SVM} classifier using the feature vectors, which is useful for {C}lass {A} {GPCR}s, the major family. {F}or the opsins and olfactory subfamilies of {C}lass {A} and other minor families ({C}lasses {B}, {C}, frizzled and smoothened), the binding {G}-protein is predicted with high accuracy using the {HMM}. {A}pplying this system to known {GPCR} sequences, each binding {G}-protein is predicted with high sensitivity and specificity (>85\% on average). {GRIFFIN} (http://griffin.cbrc.jp/) is freely available and allows users to easily execute this reliable prediction of {G}-proteins.},
  doi      = {10.1093/nar/gki495},
  pdf      = {../local/Yabuki2005GRIFFIN.pdf},
  file     = {Yabuki2005GRIFFIN.pdf:local/Yabuki2005GRIFFIN.pdf:PDF},
  keywords = {biosvm},
  pii      = {33/suppl_2/W148},
  url      = {http://dx.doi.org/10.1093/nar/gki495}
}
@incollection{Yamanishi2004Heterogeneous,
  author    = {Yamanishi, Y. and Vert, J.-P. and Kanehisa, M.},
  title     = {Heterogeneous data comparison and gene selection with kernel canonical correlation analysis},
  booktitle = {Kernel {M}ethods in {C}omputational {B}iology},
  publisher = {MIT Press},
  year      = {2004},
  editor    = {Sch{\"o}lkopf, B. and Tsuda, K. and Vert, J.-P.},
  pages     = {209-230},
  pdf       = {http://cg.ensmp.fr/~vert/publi/04kmcbbook/heterogeneous.pdf},
  file      = {heterogeneous.pdf:http\://cg.ensmp.fr/~vert/publi/04kmcbbook/heterogeneous.pdf:PDF},
  keywords  = {biosvm},
  owner     = {vert}
}
@article{Yamanishi2005Supervised,
  author   = {Yamanishi, Y. and Vert, J.-P. and Kanehisa, M.},
  title    = {Supervised enzyme network inference from the integration of genomic data and chemical information},
  journal  = {Bioinformatics},
  year     = {2005},
  volume   = {21},
  pages    = {i468-i477},
  number   = {Suppl. 1},
  abstract = {Motivation: {T}he metabolic network is an important biological network which relates enzyme proteins and chemical compounds. {A} large number of metabolic pathways remain unknown nowadays, and many enzymes are missing even in known metabolic pathways. {T}here is, therefore, an incentive to develop methods to reconstruct the unknown parts of the metabolic network and to identify genes coding for missing enzymes. {R}esults: {T}his paper presents new methods to infer enzyme networks from the integration of multiple genomic data and chemical information, in the framework of supervised graph inference. {T}he originality of the methods is the introduction of chemical compatibility as a constraint for refining the network predicted by the network inference engine. {T}he chemical compatibility between two enzymes is obtained automatically from the information encoded by their {E}nzyme {C}ommission ({EC}) numbers. {T}he proposed methods are tested and compared on their ability to infer the enzyme network of the yeast {S}accharomyces cerevisiae from four datasets for enzymes with assigned {EC} numbers: gene expression data, protein localization data, phylogenetic profiles and chemical compatibility information. {I}t is shown that the prediction accuracy of the network reconstruction consistently improves owing to the introduction of chemical constraints, the use of a supervised approach and the weighted integration of multiple datasets. {F}inally, we conduct a comprehensive prediction of a global enzyme network consisting of all enzyme candidate proteins of the yeast to obtain new biological findings.},
  doi      = {10.1093/bioinformatics/bti1012},
  pdf      = {../local/Yamanishi2005Supervised.pdf},
  file     = {Yamanishi2005Supervised.pdf:local/Yamanishi2005Supervised.pdf:PDF},
  keywords = {biosvm},
  url      = {http://dx.doi.org/10.1093/bioinformatics/bti1012}
}
@article{Yamanishi2004Protein,
  author   = {Yamanishi, Y. and Vert, J.-P. and Kanehisa, M.},
  title    = {Protein network inference from multiple genomic data: a supervised approach},
  journal  = {Bioinformatics},
  year     = {2004},
  volume   = {20},
  pages    = {i363-i370},
  number   = {Suppl. 1},
  abstract = {Motivation: {A}n increasing number of observations support the hypothesis that most biological functions involve the interactions between many proteins, and that the complexity of living systems arises as a result of such interactions. {I}n this context, the problem of inferring a global protein network for a given organism, using all available genomic data about the organism, is quickly becoming one of the main challenges in current computational biology. {R}esults: {T}his paper presents a new method to infer protein networks from multiple types of genomic data. {B}ased on a variant of kernel canonical correlation analysis, its originality is in the formalization of the protein network inference problem as a supervised learning problem, and in the integration of heterogeneous genomic data within this framework. {W}e present promising results on the prediction of the protein network for the yeast {S}accharomyces cerevisiae from four types of widely available data: gene expressions, protein interactions measured by yeast two-hybrid systems, protein localizations in the cell and protein phylogenetic profiles. {T}he method is shown to outperform other unsupervised protein network inference methods. {W}e finally conduct a comprehensive prediction of the protein network for all proteins of the yeast, which enables us to propose protein candidates for missing enzymes in a biosynthesis pathway. {A}vailability: {S}oftwares are available upon request.},
  pdf      = {../local/Yamanishi2004Protein.pdf},
  file     = {Yamanishi2004Protein.pdf:local/Yamanishi2004Protein.pdf:PDF},
  keywords = {biosvm},
  owner    = {vert},
  url      = {http://bioinformatics.oupjournals.org/cgi/reprint/20/suppl_1/i363}
}
@article{Yamanishi2003Extraction,
  author   = {Yamanishi, Y. and Vert, J.-P. and Nakaya, A. and Kanehisa, M.},
  title    = {Extraction of correlated gene clusters from multiple genomic data by generalized kernel canonical correlation analysis},
  journal  = {Bioinformatics},
  year     = {2003},
  volume   = {19},
  pages    = {i323-i330},
  number   = {Suppl. 1},
  abstract = {Motivation: {A} major issue in computational biology is the reconstruction of pathways from several genomic datasets, such as expression data, protein interaction data and phylogenetic profiles. {A}s a first step toward this goal, it is important to investigate the amount of correlation which exists between these data. {R}esults: {T}hese methods are successfully tested on their ability to recognize operons in the {E}scherichia coli genome, from the comparison of three datasets corresponding to functional relationships between genes in metabolic pathways, geometrical relationships along the chromosome, and co-expression relationships as observed by gene expression data. {C}ontact: yoshi@kuicr.kyoto-u.ac.jp},
  pdf      = {../local/Yamanishi2003Extraction.pdf},
  file     = {Yamanishi2003Extraction.pdf:local/Yamanishi2003Extraction.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/suppl_1/i323}
}
@article{Yan2004two-stage,
  author   = {Yan, C. and Dobbs, D. and Honavar, V.},
  title    = {A two-stage classifier for identification of protein-protein interface residues},
  journal  = {Bioinformatics},
  year     = {2004},
  volume   = {20},
  pages    = {i371-i378},
  number   = {Suppl. 1},
  abstract = {Motivation: {T}he ability to identify protein-protein interaction sites and to detect specific amino acid residues that contribute to the specificity and affinity of protein interactions has important implications for problems ranging from rational drug design to analysis of metabolic and signal transduction networks. {R}esults: {W}e have developed a two-stage method consisting of a support vector machine ({SVM}) and a {B}ayesian classifier for predicting surface residues of a protein that participate in protein-protein interactions. {T}his approach exploits the fact that interface residues tend to form clusters in the primary amino acid sequence. {O}ur results show that the proposed two-stage classifier outperforms previously published sequence-based methods for predicting interface residues. {W}e also present results obtained using the two-stage classifier on an independent test set of seven {CAPRI} ({C}ritical {A}ssessment of {PR}edicted {I}nteractions) targets. {T}he success of the predictions is validated by examining the predictions in the context of the three-dimensional structures of protein complexes. {S}upplementary information: http://www.public.iastate.edu/~chhyan/{ISMB}2004/list.html},
  pdf      = {../local/Yan2004two-stage.pdf},
  file     = {Yan2004two-stage.pdf:local/Yan2004two-stage.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/20/suppl_1/i371}
}
@article{Yan2004Identification,
  author   = {Yan, C. and Honavar, V. and Dobbs, D.},
  title    = {Identification of interface residues in protease-inhibitor and antigen-antibody complexes: a support vector machine approach},
  journal  = {Neural {C}omput. \& {A}pplic.},
  year     = {2004},
  volume   = {13},
  pages    = {123-129},
  doi      = {10.1007/s00521-004-0414-3},
  pdf      = {../local/Yan2004Identification.pdf},
  file     = {Yan2004Identification.pdf:local/Yan2004Identification.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert}
}
@article{Yang2004Bio-support,
  author   = {Yang, Z. R. and Chou, K.-C.},
  title    = {Bio-support vector machines for computational proteomics},
  journal  = {Bioinformatics},
  year     = {2004},
  volume   = {20},
  pages    = {735-741},
  number   = {5},
  abstract = {Motivation: {O}ne of the most important issues in computational proteomics is to produce a prediction model for the classification or annotation of biological function of novel protein sequences. {I}n order to improve the prediction accuracy, much attention has been paid to the improvement of the performance of the algorithms used, few is for solving the fundamental issue, namely, amino acid encoding as most existing pattern recognition algorithms are unable to recognize amino acids in protein sequences. {I}mportantly, the most commonly used amino acid encoding method has the flaw that leads to large computational cost and recognition bias. {R}esults: {B}y replacing kernel functions of support vector machines ({SVM}s) with amino acid similarity measurement matrices, we have modified {SVM}s, a new type of pattern recognition algorithm for analysing protein sequences, particularly for proteolytic cleavage site prediction. {W}e refer to the modified {SVM}s as bio-support vector machine. {W}hen applied to the prediction of {HIV} protease cleavage sites, the new method has shown a remarkable advantage in reducing the model complexity and enhancing the model robustness.},
  pdf      = {../local/Yang2004Bio-support.pdf},
  file     = {Yang2004Bio-support.pdf:local/Yang2004Bio-support.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/20/5/735}
}
@article{Yao2004Comparative,
  author   = {X. J. Yao and A. Panaye and J. P. Doucet and R. S. Zhang and H. F. Chen and M. C. Liu and Z. D. Hu and B. T. Fan},
  title    = {Comparative study of {QSAR}/{QSPR} correlations using support vector machines, radial basis function neural networks, and multiple linear regression.},
  journal  = {J {C}hem {I}nf {C}omput {S}ci},
  year     = {2004},
  volume   = {44},
  pages    = {1257-66},
  number   = {4},
  abstract = {Support vector machines ({SVM}s) were used to develop {QSAR} models that correlate molecular structures to their toxicity and bioactivities. {T}he performance and predictive ability of {SVM} are investigated and compared with other methods such as multiple linear regression and radial basis function neural network methods. {I}n the present study, two different data sets were evaluated. {T}he first one involves an application of {SVM} to the development of a {QSAR} model for the prediction of toxicities of 153 phenols, and the second investigation deals with the {QSAR} model between the structures and the activities of a set of 85 cyclooxygenase 2 ({COX}-2) inhibitors. {F}or each application, the molecular structures were described using either the physicochemical parameters or molecular descriptors. {I}n both studied cases, the predictive ability of the {SVM} model is comparable or superior to those obtained by {MLR} and {RBFNN}. {T}he results indicate that {SVM} can be used as an alternative powerful modeling tool for {QSAR} studies.},
  doi      = {10.1021/ci049965i},
  pdf      = {../local/Yao2004Comparative.pdf},
  file     = {Yao2004Comparative.pdf:local/Yao2004Comparative.pdf:PDF},
  keywords = {biosvm chemoinformatics},
  url      = {http://dx.doi.org/10.1021/ci049965i}
}
@article{Yap2004Prediction,
  author   = {C. W. Yap and C. Z. Cai and Y. Xue and Y. Z. Chen},
  title    = {Prediction of torsade-causing potential of drugs by support vector machine approach.},
  journal  = {Toxicol {S}ci},
  year     = {2004},
  volume   = {79},
  pages    = {170-7},
  number   = {1},
  month    = {May},
  abstract = {In an effort to facilitate drug discovery, computational methods for facilitating the prediction of various adverse drug reactions ({ADR}s) have been developed. {S}o far, attention has not been sufficiently paid to the development of methods for the prediction of serious {ADR}s that occur less frequently. {S}ome of these {ADR}s, such as torsade de pointes ({T}d{P}), are important issues in the approval of drugs for certain diseases. {T}hus there is a need to develop tools for facilitating the prediction of these {ADR}s. {T}his work explores the use of a statistical learning method, support vector machine ({SVM}), for {T}d{P} prediction. {T}d{P} involves multiple mechanisms and {SVM} is a method suitable for such a problem. {O}ur {SVM} classification system used a set of linear solvation energy relationship ({LSER}) descriptors and was optimized by leave-one-out cross validation procedure. {I}ts prediction accuracy was evaluated by using an independent set of agents and by comparison with results obtained from other commonly used classification methods using the same dataset and optimization procedure. {T}he accuracies for the {SVM} prediction of {T}d{P}-causing agents and non-{T}d{P}-causing agents are 97.4 and 84.6\% respectively; one is substantially improved against and the other is comparable to the results obtained by other classification methods useful for multiple-mechanism prediction problems. {T}his indicates the potential of {SVM} in facilitating the prediction of {T}d{P}-causing risk of small molecules and perhaps other {ADR}s that involve multiple mechanisms.},
  doi      = {10.1093/toxsci/kfh082},
  pdf      = {../local/Yap2004Prediction.pdf},
  file     = {Yap2004Prediction.pdf:local/Yap2004Prediction.pdf:PDF},
  keywords = {biosvm chemoinformatics},
  pii      = {kfh082},
  url      = {http://dx.doi.org/10.1093/toxsci/kfh082}
}
@article{Yap2005Prediction,
  author   = {C. W. Yap and Y. Z. Chen},
  title    = {Prediction of {C}ytochrome {P}450 3{A}4, 2{D}6, and 2{C}9 {I}nhibitors and {S}ubstrates by {U}sing {S}upport {V}ector {M}achines.},
  journal  = {J {C}hem {I}nf {M}odel},
  year     = {2005},
  volume   = {45},
  pages    = {982-92},
  number   = {4},
  abstract = {Statistical learning methods have been used in developing filters for predicting inhibitors of two {P}450 isoenzymes, {CYP}3{A}4 and {CYP}2{D}6. {T}his work explores the use of different statistical learning methods for predicting inhibitors of these enzymes and an additional {P}450 enzyme, {CYP}2{C}9, and the substrates of the three {P}450 isoenzymes. {T}wo consensus support vector machine ({CSVM}) methods, "positive majority" ({PM}-{CSVM}) and "positive probability" ({PP}-{CSVM}), were used in this work. {T}hese methods were first tested for the prediction of inhibitors of {CYP}3{A}4 and {CYP}2{D}6 by using a significantly higher number of inhibitors and noninhibitors than that used in earlier studies. {T}hey were then applied to the prediction of inhibitors of {CYP}2{C}9 and substrates of the three enzymes. {B}oth methods predict inhibitors of {CYP}3{A}4 and {CYP}2{D}6 at a similar level of accuracy as those of earlier studies. {F}or classification of inhibitors of {CYP}2{C}9, the best {CSVM} method gives an accuracy of 88.9\% for inhibitors and 96.3\% for noninhibitors. {T}he accuracies for classification of substrates and nonsubstrates of {CYP}3{A}4, {CYP}2{D}6, and {CYP}2{C}9 are 98.2 and 90.9\%, 96.6 and 94.4\%, and 85.7 and 98.8\%, respectively. {B}oth {CSVM} methods are potentially useful as filters for predicting inhibitors and substrates of {P}450 isoenzymes. {T}hese methods generally give better accuracies than single {SVM} classification systems, and the performance of the {PP}-{CSVM} method is slightly better than that of the {PM}-{CSVM} method.},
  doi      = {10.1021/ci0500536},
  pdf      = {../local/Yap2005Prediction.pdf},
  file     = {Yap2005Prediction.pdf:local/Yap2005Prediction.pdf:PDF},
  keywords = {biosvm chemoinformatics},
  url      = {http://dx.doi.org/10.1021/ci0500536}
}
@article{Yeang2001Molecular,
  author   = {Yeang, C.H. and Ramaswamy, S. and Tamayo, P. and Mukherjee, S. and Rifkin, R.M. and Angelo, M. and Reich, M. and Lander, E. and Mesirov, J. and Golub, T.},
  title    = {Molecular classification of multiple tumor types},
  journal  = {Bioinformatics},
  year     = {2001},
  volume   = {17},
  pages    = {S316--S322},
  number   = {Suppl. 1},
  abstract = {Using gene expression data to classify tumor types is a very promising tool in cancer diagnosis. {P}revious works show several pairs of tumor types can be successfully distinguished by their gene expression patterns ({G}olub et al. 1999, {B}en-{D}or et al. 2000, {A}lizadeh et al. 2000). {H}owever, the simultaneous classification across a heterogeneous set of tumor types has not been well studied yet. {W}e obtained 190 samples from 14 tumor classes and generated a combined expression dataset containing 16063 genes for each of those samples. {W}e performed multi-class classification by combining the outputs of binary classifiers. {T}hree binary classifiers (k-nearest neighbors, weighted voting, and support vector machines) were applied in conjunction with three combination scenarios (one-vs-all, all-pairs, hierarchical partitioning). {W}e achieved the best cross validation error rate of 18.75\% and the best test error rate of 21.74\% by using the one-vs-all support vector machine algorithm. {T}he results demonstrate the feasibility of performing clinically useful classification from samples of multiple tumor types.},
  pdf      = {../local/Yeang2001Molecular.pdf},
  file     = {Yeang2001Molecular.pdf:local/Yeang2001Molecular.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/17/suppl_1/S316}
}
@article{Yiu2005Filtering,
  author   = {Yiu, S. M. and Wong, Prudence W. H. and Lam, T. W. and Mui, Y. C. and Kung, H. F. and Lin, Marie and Cheung, Y. T.},
  title    = {Filtering of Ineffective {siRNAs} and Improved {siRNA} Design Tool},
  journal  = {Bioinformatics},
  year     = {2005},
  month    = jan,
  volume   = {21},
  number   = {2},
  pages    = {144--151},
  abstract = {Motivation: {S}hort interfering {RNA}s (si{RNA}s) can be used to suppress gene expression and possess many potential applications in therapy, but how to design an effective si{RNA} is still not clear. {B}ased on the {MPI} ({M}ax-{P}lanck-{I}nstitute) basic principles, a number of si{RNA} design tools have been developed recently. {T}he set of candidates reported by these tools is usually large and often contains ineffective si{RNA}s. {I}n view of this, we initiate the study of filtering ineffective si{RNA}s. {R}esults: {T}he contribution of this paper is 2-fold. {F}irst, we propose a fair scheme to compare existing design tools based on real data in the literature. {S}econd, we attempt to improve the {MPI} principles and existing tools by an algorithm that can filter ineffective si{RNA}s. {T}he algorithm is based on some new observations on the secondary structure, which we have verified by {AI} techniques (decision trees and support vector machines). {W}e have tested our algorithm together with the {MPI} principles and the existing tools. {T}he results show that our filtering algorithm is effective. {A}vailability: {T}he si{RNA} design software tool can be found in the website http://www.cs.hku.hk/~sirna/ {C}ontact: smyiu@cs.hku.hk},
  doi      = {10.1093/bioinformatics/bth498},
  pdf      = {../local/Yiu2005Filtering.pdf},
  file     = {Yiu2005Filtering.pdf:local/Yiu2005Filtering.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/21/2/144}
}
@article{Yoon2003Analysis,
  author   = {Yoon, Y. and Song, J. and Hong, S. H. and Kim, J. Q.},
  title    = {Analysis of multiple single nucleotide polymorphisms of candidate genes related to coronary heart disease susceptibility by using support vector machines},
  journal  = {Clin. Chem. Lab. Med.},
  year     = {2003},
  volume   = {41},
  number   = {4},
  pages    = {529--534},
  abstract = {Coronary heart disease ({CHD}) is a complex genetic disease involving gene-environment interaction. {M}any association studies between single nucleotide polymorphisms ({SNP}s) of candidate genes and {CHD} have been reported. {W}e have applied a new method to analyze such relationships using support vector machines ({SVM}s), which is one of the methods for artificial neuronal network. {W}e assumed that common haplotype implicit in genotypes will differ between cases and controls, and that this will allow {SVM}-derived patterns to be classifiable according to subject genotypes. {F}ourteen {SNP}s of ten candidate genes in 86 {CHD} patients and 119 controls were investigated. {G}enotypes were transformed to a numerical vector by giving scores based on difference between the genotypes of each subject and the reference genotypes, which represent the healthy normal population. {O}verall classification accuracy by {SVM}s was 64.4\% with a receiver operating characteristic ({ROC}) area of 0.639. {B}y conventional analysis using the chi2 test, the association between {CHD} and the {SNP} of the scavenger receptor {B}1 gene was most significant in terms of allele frequencies in cases vs. controls (p = 0.0001). {I}n conclusion, we suggest that the application of {SVM}s for association studies of {SNP}s in candidate genes shows considerable promise and that further work could be usefully performed upon the estimation of {CHD} susceptibility in individuals of high risk.},
  pdf      = {../local/Yoon2003Analysis.pdf},
  file     = {Yoon2003Analysis.pdf:local/Yoon2003Analysis.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://www.degruyter.de/journals/cclm/abs/10592.html}
}
@article{Yu2003Fine-grained,
  author   = {Yu, C. S. and Wang, J. Y. and Yang, J. M. and Lyu, P. C. and Lin, C. J. and Hwang, J. K.},
  title    = {Fine-grained protein fold assignment by support vector machines using generalized npeptide coding schemes and jury voting from multiple-parameter sets.},
  journal  = {Proteins},
  year     = {2003},
  volume   = {50},
  number   = {4},
  pages    = {531--536},
  abstract = {In the coarse-grained fold assignment of major protein classes, such as all-alpha, all-beta, alpha + beta, alpha/beta proteins, one can easily achieve high prediction accuracy from primary amino acid sequences. {H}owever, the fine-grained assignment of folds, such as those defined in the {S}tructural {C}lassification of {P}roteins ({SCOP}) database, presents a challenge due to the larger amount of folds available. {R}ecent study yielded reasonable prediction accuracy of 56.0\% on an independent set of 27 most populated folds. {I}n this communication, we apply the support vector machine ({SVM}) method, using a combination of protein descriptors based on the properties derived from the composition of n-peptide and jury voting, to the fine-grained fold prediction, and are able to achieve an overall prediction accuracy of 69.6\% on the same independent set-significantly higher than the previous results. {O}n 10-fold cross-validation, we obtained a prediction accuracy of 65.3\%. {O}ur results show that {SVM} coupled with suitable global sequence-coding schemes can significantly improve the fine-grained fold prediction. {O}ur approach should be useful in structure prediction and modeling.},
  doi      = {10.1002/prot.10313},
  pdf      = {../local/Yu2003Fine-grained.pdf},
  file     = {Yu2003Fine-grained.pdf:local/Yu2003Fine-grained.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://dx.doi.org/10.1002/prot.10313}
}
@article{Yu2005Classifying,
  author   = {Yu, C. and Zavaljevski, N. and Stevens, F. J. and Yackovich, K. and Reifman, J.},
  title    = {Classifying noisy protein sequence data: a case study of immunoglobulin light chains.},
  journal  = {Bioinformatics},
  year     = {2005},
  month    = jun,
  volume   = {21},
  number   = {Suppl 1},
  pages    = {i495--i501},
  abstract = {S{UMMARY}: {T}he classification of protein sequences obtained from patients with various immunoglobulin-related conformational diseases may provide insight into structural correlates of pathogenicity. {H}owever, clinical data are very sparse and, in the case of antibody-related proteins, the collected sequences have large variability with only a small subset of variations relevant to the protein pathogenicity (function). {O}n this basis, these sequences represent a model system for development of strategies to recognize the small subset of function-determining variations among the much larger number of primary structure diversifications introduced during evolution. {U}nder such conditions, most protein classification algorithms have limited accuracy. {T}o address this problem, we propose a support vector machine ({SVM})-based classifier that combines sequence and 3{D} structural averaging information. {E}ach amino acid in the sequence is represented by a set of six physicochemical properties: hydrophobicity, hydrophilicity, volume, surface area, bulkiness and refractivity. {E}ach position in the sequence is described by the properties of the amino acid at that position and the properties of its neighbors in 3{D} space or in the sequence. {A} structure template is selected to determine neighbors in 3{D} space and a window size is used to determine the neighbors in the sequence. {T}he test data consist of 209 proteins of human antibody immunoglobulin light chains, each represented by aligned sequences of 120 amino acids. {T}he methodology is applied to the classification of protein sequences collected from patients with and without amyloidosis, and indicates that the proposed modified classifiers are more robust to sequence variability than standard {SVM} classifiers, improving classification error between 5 and 25\% and sensitivity between 9 and 17\%. {T}he classification results might also suggest possible mechanisms for the propensity of immunoglobulin light chains to amyloid formation. {CONTACT}: cyu@bioanalysis.org.},
  doi      = {10.1093/bioinformatics/bti1024},
  pdf      = {../local/Yu2005Classifying.pdf},
  file     = {Yu2005Classifying.pdf:local/Yu2005Classifying.pdf:PDF},
  keywords = {biosvm},
  pii      = {21/suppl_1/i495},
  url      = {http://dx.doi.org/10.1093/bioinformatics/bti1024}
}
@article{Yu2004Predicting,
  author   = {Yu, C.-S. and Lin, C.-J. and Hwang, J.-K.},
  title    = {Predicting subcellular localization of proteins for {Gram}-negative bacteria by support vector machines based on n-peptide compositions},
  journal  = {Protein Sci.},
  year     = {2004},
  volume   = {13},
  number   = {5},
  pages    = {1402--1406},
  abstract = {Gram-negative bacteria have five major subcellular localization sites: the cytoplasm, the periplasm, the inner membrane, the outer membrane, and the extracellular space. {T}he subcellular location of a protein can provide valuable information about its function. {W}ith the rapid increase of sequenced genomic data, the need for an automated and accurate tool to predict subcellular localization becomes increasingly important. {W}e present an approach to predict subcellular localization for {G}ram-negative bacteria. {T}his method uses the support vector machines trained by multiple feature vectors based on n-peptide compositions. {F}or a standard data set comprising 1443 proteins, the overall prediction accuracy reaches 89\%, which, to the best of our knowledge, is the highest prediction rate ever reported. {O}ur prediction is 14\% higher than that of the recently developed multimodular {PSORT}-{B}. {B}ecause of its simplicity, this approach can be easily extended to other organisms and should be a useful tool for the high-throughput and large-scale analysis of proteomic and genomic data.},
  doi      = {10.1110/ps.03479604},
  pdf      = {../local/Yu2004Predicting.pdf},
  file     = {Yu2004Predicting.pdf:local/Yu2004Predicting.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://www.proteinscience.org/cgi/content/abstract/13/5/1402}
}
@article{Yu2004integrated,
  author   = {Yu, J. K. and Chen, Y. D. and Zheng, S.},
  title    = {An integrated approach to the detection of colorectal cancer utilizing proteomics and bioinformatics},
  journal  = {World J. Gastroenterol.},
  year     = {2004},
  volume   = {10},
  number   = {21},
  pages    = {3127--3131},
  abstract = {A{IM}: {T}o find new potential biomarkers and to establish patterns for early detection of colorectal cancer. {METHODS}: {O}ne hundred and eighty-two serum samples including 55 from colorectal cancer ({CRC}) patients, 35 from colorectal adenoma ({CRA}) patients and 92 from healthy persons ({HP}) were detected by surface-enhanced laser desorption/ionization mass spectrometry ({SELDI}-{MS}). {T}he data of spectra were analyzed by bioinformatics tools like artificial neural network ({ANN}) and support vector machine ({SVM}). {RESULTS}: {T}he diagnostic pattern combined with 7 potential biomarkers could differentiate {CRC} patients from {CRA} patients with a specificity of 83\%, sensitivity of 89\% and positive predictive value of 89\%. {T}he diagnostic pattern combined with 4 potential biomarkers could differentiate {CRC} patients from {HP} with a specificity of 92\%, sensitivity of 89\% and positive predictive value of 86\%. {CONCLUSION}: {T}he combination of {SELDI} with bioinformatics tools could help find new biomarkers and establish patterns with high sensitivity and specificity for the detection of {CRC}.},
  pdf      = {../local/Yu2004integrated.pdf},
  file     = {Yu2004integrated.pdf:local/Yu2004integrated.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert}
}
@article{Yu2005Ovarian,
  author   = {Yu, J. S. and Ongarello, S. and Fiedler, R. and Chen, X. W. and Toffolo, G. and Cobelli, C. and Trajanoski, Z.},
  title    = {Ovarian cancer identification based on dimensionality reduction for high-throughput mass spectrometry data.},
  journal  = {Bioinformatics},
  year     = {2005},
  month    = may,
  volume   = {21},
  number   = {10},
  pages    = {2200--2209},
  abstract = {M{OTIVATION}: {H}igh-throughput and high-resolution mass spectrometry instruments are increasingly used for disease classification and therapeutic guidance. {H}owever, the analysis of immense amount of data poses considerable challenges. {W}e have therefore developed a novel method for dimensionality reduction and tested on a published ovarian high-resolution {SELDI}-{TOF} dataset. {RESULTS}: {W}e have developed a four-step strategy for data preprocessing based on: (1) binning, (2) {K}olmogorov-{S}mirnov test, (3) restriction of coefficient of variation and (4) wavelet analysis. {S}ubsequently, support vector machines were used for classification. {T}he developed method achieves an average sensitivity of 97.38\% (sd = 0.0125) and an average specificity of 93.30\% (sd = 0.0174) in 1000 independent k-fold cross-validations, where k = 2, ..., 10. {AVAILABILITY}: {T}he software is available for academic and non-commercial institutions.},
  doi      = {10.1093/bioinformatics/bti370},
  pdf      = {../local/Yu2005Ovarian.pdf},
  file     = {Yu2005Ovarian.pdf:local/Yu2005Ovarian.pdf:PDF},
  keywords = {biosvm proteomics},
  pii      = {bti370},
  url      = {http://dx.doi.org/10.1093/bioinformatics/bti370}
}
@article{Yu2005integrated,
  author   = {Yu, J.-k. and Zheng, S. and Tang, Y. and Li, L.},
  title    = {An integrated approach utilizing proteomics and bioinformatics to detect ovarian cancer.},
  journal  = {J Zhejiang Univ Sci B},
  year     = {2005},
  month    = apr,
  volume   = {6},
  number   = {4},
  pages    = {227--231},
  abstract = {O{BJECTIVE}: {T}o find new potential biomarkers and establish the patterns for the detection of ovarian cancer. {METHODS}: {S}ixty one serum samples including 32 ovarian cancer patients and 29 healthy people were detected by surface-enhanced laser desorption/ionization mass spectrometry ({SELDI}-{MS}). {T}he protein fingerprint data were analyzed by bioinformatics tools. {T}en folds cross-validation support vector machine ({SVM}) was used to establish the diagnostic pattern. {RESULTS}: {F}ive potential biomarkers were found (2085 {D}a, 5881 {D}a, 7564 {D}a, 9422 {D}a, 6044 {D}a), combined with which the diagnostic pattern separated the ovarian cancer from the healthy samples with a sensitivity of 96.7\%, a specificity of 96.7\% and a positive predictive value of 96.7\%. {CONCLUSIONS}: {T}he combination of {SELDI} with bioinformatics tools could find new biomarkers and establish patterns with high sensitivity and specificity for the detection of ovarian cancer.},
  doi      = {10.1631/jzus.2005.B0227},
  pdf      = {../local/Yu2005integrated.pdf},
  file     = {Yu2005integrated.pdf:local/Yu2005integrated.pdf:PDF},
  keywords = {biosvm},
  url      = {http://dx.doi.org/10.1631/jzus.2005.B0227}
}
@article{Yuan2002Prediction,
  author   = {Yuan, Z. and Burrage, K. and Mattick, J. S.},
  title    = {Prediction of protein solvent accessibility using support vector machines},
  journal  = {Proteins},
  year     = {2002},
  volume   = {48},
  number   = {3},
  pages    = {566--570},
  abstract = {A {S}upport {V}ector {M}achine learning system has been trained to predict protein solvent accessibility from the primary structure. {D}ifferent kernel functions and sliding window sizes have been explored to find how they affect the prediction performance. {U}sing a cut-off threshold of 15\% that splits the dataset evenly (an equal number of exposed and buried residues), this method was able to achieve a prediction accuracy of 70.1\% for single sequence input and 73.9\% for multiple alignment sequence input, respectively. {T}he prediction of three and more states of solvent accessibility was also studied and compared with other methods. {T}he prediction accuracies are better than, or comparable to, those obtained by other methods such as neural networks, {B}ayesian classification, multiple linear regression, and information theory. {I}n addition, our results further suggest that this system may be combined with other prediction methods to achieve more reliable results, and that the {S}upport {V}ector {M}achine method is a very useful tool for biological sequence analysis.},
  doi      = {10.1002/prot.10176},
  pdf      = {../local/Yuan2002Prediction.pdf},
  file     = {Yuan2002Prediction.pdf:local/Yuan2002Prediction.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://dx.doi.org/10.1002/prot.10176}
}
@article{Yuan2004SVMtm,
  author   = {Yuan, Z. and Mattick, J. S. and Teasdale, R. D.},
  title    = {{SVMtm}: support vector machines to predict transmembrane segments.},
  journal  = {J. Comput. Chem.},
  year     = {2004},
  volume   = {25},
  number   = {5},
  pages    = {632--636},
  abstract = {A new method has been developed for prediction of transmembrane helices using support vector machines. {D}ifferent coding schemes of protein sequences were explored, and their performances were assessed by crossvalidation tests. {T}he best performance method can predict the transmembrane helices with sensitivity of 93.4\% and precision of 92.0\%. {F}or each predicted transmembrane segment, a score is given to show the strength of transmembrane signal and the prediction reliability. {I}n particular, this method can distinguish transmembrane proteins from soluble proteins with an accuracy of approximately 99\%. {T}his method can be used to complement current transmembrane helix prediction methods and can be used for consensus analysis of entire proteomes. {T}he predictor is located at http://genet.imb.uq.edu.au/predictors/{SVM}tm.},
  doi      = {10.1002/jcc.10411},
  pdf      = {../local/Yuan2004SVMtm.pdf},
  file     = {Yuan2004SVMtm.pdf:local/Yuan2004SVMtm.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://dx.doi.org/10.1002/jcc.10411}
}
@article{Zaki2005Application,
  author   = {Zaki, N. M. and Deris, S. and Illias, R.},
  title    = {Application of string kernels in protein sequence classification.},
  journal  = {Appl. Bioinformatics},
  year     = {2005},
  volume   = {4},
  number   = {1},
  pages    = {45--52},
  abstract = {I{NTRODUCTION}: {T}he production of biological information has become much greater than its consumption. {T}he key issue now is how to organise and manage the huge amount of novel information to facilitate access to this useful and important biological information. {O}ne core problem in classifying biological information is the annotation of new protein sequences with structural and functional features. {METHOD}: {T}his article introduces the application of string kernels in classifying protein sequences into homogeneous families. {A} string kernel approach used in conjunction with support vector machines has been shown to achieve good performance in text categorisation tasks. {W}e evaluated and analysed the performance of this approach, and we present experimental results on three selected families from the {SCOP} ({S}tructural {C}lassification of {P}roteins) database. {W}e then compared the overall performance of this method with the existing protein classification methods on benchmark {SCOP} datasets. {RESULTS}: {A}ccording to the {F}1 performance measure and the rate of false positive ({RFP}) measure, the string kernel method performs well in classifying protein sequences. {T}he method outperformed all the generative-based methods and is comparable with the {SVM}-{F}isher method. {DISCUSSION}: {A}lthough the string kernel approach makes no use of prior biological knowledge, it still captures sufficient biological information to enable it to outperform some of the state-of-the-art methods.},
  keywords = {biosvm},
  pii      = {415}
}
@article{Zavaljevski2002Support,
  author   = {Zavaljevski, N. and Stevens, F. J. and Reifman, J.},
  title    = {Support vector machines with selective kernel scaling for protein classification and identification of key amino acid positions},
  journal  = {Bioinformatics},
  year     = {2002},
  volume   = {18},
  number   = {5},
  pages    = {689--696},
  abstract = {Motivation: {D}ata that characterize primary and tertiary structures of proteins are now accumulating at a rapid and accelerating rate and require automated computational tools to extract critical information relating amino acid changes with the spectrum of functionally attributes exhibited by a protein. {W}e propose that immunoglobulin-type beta-domains, which are found in approximate 400 functionally distinct forms in humans alone, provide the immense genetic variation within limited conformational changes that might facilitate the development of new computational tools. {A}s an initial step, we describe here an approach based on {S}upport {V}ector {M}achine ({SVM}) technology to identify amino acid variations that contribute to the functional attribute of pathological self-assembly by some human antibody light chains produced during plasma cell diseases. {R}esults: {W}e demonstrate that {SVM}s with selective kernel scaling are an effective tool in discriminating between benign and pathologic human immunoglobulin light chains. {I}nitial results compare favorably against manual classification performed by experts and indicate the capability of {SVM}s to capture the underlying structure of the data. {T}he data set consists of 70 proteins of human antibody $\kappa$1 light chains, each represented by aligned sequences of 120 amino acids. {W}e perform feature selection based on a first-order adaptive scaling algorithm, which confirms the importance of changes in certain amino acid positions and identifies other positions that are key in the characterization of protein function.},
  pdf      = {../local/zava02.pdf},
  file     = {zava02.pdf:local/zava02.pdf:PDF},
  keywords = {biosvm},
  subject  = {biokernel},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/18/5/689}
}
@article{Zernov2003Drug,
  author   = {Zernov, V. V. and Balakin, K. V. and Ivaschenko, A. A. and Savchuk, N. P. and Pletnev, I. V.},
  title    = {Drug discovery using support vector machines. {The} case studies of drug-likeness, agrochemical-likeness, and enzyme inhibition predictions.},
  journal  = {J Chem Inf Comput Sci},
  year     = {2003},
  volume   = {43},
  number   = {6},
  pages    = {2048--2056},
  abstract = {Support {V}ector {M}achines ({SVM}) is a powerful classification and regression tool that is becoming increasingly popular in various machine learning applications. {W}e tested the ability of {SVM}, in comparison with well-known neural network techniques, to predict drug-likeness and agrochemical-likeness for large compound collections. {F}or both kinds of data, {SVM} outperforms various neural networks using the same set of descriptors. {W}e also used {SVM} for estimating the activity of {C}arbonic {A}nhydrase {II} ({CA} {II}) enzyme inhibitors and found that the prediction quality of our {SVM} model is better than that reported earlier for conventional {QSAR}. {M}odel characteristics and data set features were studied in detail.},
  doi      = {10.1021/ci0340916},
  pdf      = {../local/Zernov2003Drug.pdf},
  file     = {Zernov2003Drug.pdf:local/Zernov2003Drug.pdf:PDF},
  keywords = {biosvm chemoinformatics},
  url      = {http://dx.doi.org/10.1021/ci0340916}
}
@article{Zhang2005Improved,
  author   = {Zhang, Qidong and Yoon, Sukjoon and Welsh, William J.},
  title    = {Improved method for predicting beta-turn using support vector machine.},
  journal  = {Bioinformatics},
  year     = {2005},
  month    = may,
  volume   = {21},
  number   = {10},
  pages    = {2370--2374},
  abstract = {M{OTIVATION}: {N}umerous methods for predicting beta-turns in proteins have been developed based on various computational schemes. {H}ere, we introduce a new method of beta-turn prediction that uses the support vector machine ({SVM}) algorithm together with predicted secondary structure information. {V}arious parameters from the {SVM} have been adjusted to achieve optimal prediction performance. {RESULTS}: {T}he {SVM} method achieved excellent performance as measured by the {M}atthews correlation coefficient ({MCC} = 0.45) using a 7-fold cross validation on a database of 426 non-homologous protein chains. {T}o our best knowledge, this {MCC} value is the highest achieved so far for predicting beta-turn. {T}he overall prediction accuracy {Q}total was 77.3\%, which is the best among the existing prediction methods. {A}mong its unique attractive features, the present {SVM} method avoids overtraining and compresses information and provides a predicted reliability index.},
  doi      = {10.1093/bioinformatics/bti358},
  pdf      = {../local/Zhang2005Improved.pdf},
  file     = {Zhang2005Improved.pdf:local/Zhang2005Improved.pdf:PDF},
  keywords = {biosvm},
  pii      = {bti358},
  url      = {http://dx.doi.org/10.1093/bioinformatics/bti358}
}
@article{Zhang2003Classification,
  author   = {Zhang, S.-W. and Pan, Q. and Zhang, H.-C. and Zhang, Y.-L. and Wang, H.-Y.},
  title    = {Classification of protein quaternary structure with support vector machine},
  journal  = {Bioinformatics},
  year     = {2003},
  volume   = {19},
  number   = {18},
  pages    = {2390--2396},
  abstract = {Motivation: {S}ince the gap between sharply increasing known sequences and slow accumulation of known structures is becoming large, an automatic classification process based on the primary sequences and known three-dimensional structure becomes indispensable. {T}he classification of protein quaternary structure based on the primary sequences can provide some useful information for the biologists. {S}o a fully automatic and reliable classification system is needed. {T}his work tries to look for the effective methods of extracting attribute and the algorithm for classifying the quaternary structure from the primary sequences. {R}esults: {B}oth of the support vector machine ({SVM}) and the covariant discriminant algorithms have been first introduced to predict quaternary structure properties from the protein primary sequences. {T}he amino acid composition and the auto-correlation functions based on the amino acid index profile of the primary sequence have been taken into account in the algorithms. {W}e have analyzed 472 amino acid indices and selected the four amino acid indices as the examples, which have the best performance. {T}hus the five attribute parameter data sets ({COMP}, {FASG}, {NISK}, {WOLS} and {KYTJ}) were established from the protein primary sequences. {T}he {COMP} attribute data set is composed of amino acid composition, and the {FASG}, {NISK}, {WOLS} and {KYTJ} attribute data sets are composed of the amino acid composition and the auto-correlation functions of the corresponding amino acid residue index. {T}he overall accuracies of {SVM} are 78.5, 87.5, 83.2, 81.7 and 81.9\%, respectively, for {COMP}, {FASG}, {NISK}, {WOLS} and {KYTJ} data sets in jackknife test, which are 19.6, 7.8, 15.5, 13.1 and 15.8\%, respectively, higher than that of the covariant discriminant algorithm in the same test. {T}he results show that {SVM} may be applied to discriminate between the primary sequences of homodimers and non-homodimers and the two protein sequence descriptors can reflect the quaternary structure information. {C}ompared with previous {R}obert {G}arian's investigation, the performance of {SVM} is almost equal to that of the {D}ecision tree models, and the methods of extracting feature vector from the primary sequences are superior to {R}obert's binning function method. {A}vailability: {P}rograms are available on request from the authors.},
  pdf      = {../local/Zhang2003Classification.pdf},
  file     = {Zhang2003Classification.pdf:local/Zhang2003Classification.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/18/2390}
}
@article{Zhang2003Sequence,
  author   = {Zhang, X. H-F. and Heller, K. A. and Hefter, I. and Leslie, C. S. and Chasin, L. A.},
  title    = {Sequence Information for the Splicing of Human Pre-{mRNA} Identified by Support Vector Machine Classification},
  journal  = {Genome Res.},
  year     = {2003},
  volume   = {13},
  number   = {12},
  pages    = {2637--2650},
  abstract = {Vertebrate pre-m{RNA} transcripts contain many sequences that resemble splice sites on the basis of agreement to the consensus, yet these more numerous false splice sites are usually completely ignored by the cellular splicing machinery. {E}ven at the level of exon definition, pseudo exons defined by such false splices sites outnumber real exons by an order of magnitude. {W}e used a support vector machine to discover sequence information that could be used to distinguish real exons from pseudo exons. {T}his machine learning tool led to the definition of potential branch points, an extended polypyrimidine tract, and {C}-rich and {TG}-rich motifs in a region limited to 50 nt upstream of constitutively spliced exons. {C}-rich sequences were also found in a region extending to 80 nt downstream of exons, along with {G}-triplet motifs. {I}n addition, it was shown that combinations of three bases within the splice donor consensus sequence were more effective than consensus values in distinguishing real from pseudo splice sites; two-way base combinations were optimal for distinguishing 3' splice sites. {T}hese data also suggest that interactions between two or more of these elements may contribute to exon recognition, and provide candidate sequences for assessment as intronic splicing enhancers.},
  doi      = {10.1101/gr.1679003},
  pdf      = {../local/Zhang2003Sequence.pdf},
  file     = {Zhang2003Sequence.pdf:local/Zhang2003Sequence.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://www.genome.org/cgi/content/abstract/13/12/2637}
}
@article{Zhang2005Descriptor-based,
  author   = {Zhang, Z. and Kochhar, S. and Grigorov, M. G.},
  title    = {Descriptor-based protein remote homology identification.},
  journal  = {Protein Sci.},
  year     = {2005},
  volume   = {14},
  number   = {2},
  pages    = {431--444},
  abstract = {Here, we report a novel protein sequence descriptor-based remote homology identification method, able to infer fold relationships without the explicit knowledge of structure. {I}n a first phase, we have individually benchmarked 13 different descriptor types in fold identification experiments in a highly diverse set of protein sequences. {T}he relevant descriptors were related to the fold class membership by using simple similarity measures in the descriptor spaces, such as the cosine angle. {O}ur results revealed that the three best-performing sets of descriptors were the sequence-alignment-based descriptor using {PSI}-{BLAST} e-values, the descriptors based on the alignment of secondary structural elements ({SSEA}), and the descriptors based on the occurrence of {PROSITE} functional motifs. {I}n a second phase, the three top-performing descriptors were combined to obtain a final method with improved performance, which we named {D}esc{F}old. {C}lass membership was predicted by {S}upport {V}ector {M}achine ({SVM}) learning. {I}n comparison with the individual {PSI}-{BLAST}-based descriptor, the rate of remote homology identification increased from 33.7\% to 46.3\%. {W}e found out that the composite set of descriptors was able to identify the true remote homolog for nearly every sixth sequence at the 95\% confidence level, or some 10\% more than a single {PSI}-{BLAST} search. {W}e have benchmarked the {D}esc{F}old method against several other state-of-the-art fold recognition algorithms for the 172 {L}ive{B}ench-8 targets, and we concluded that it was able to add value to the existing techniques by providing a confident hit for at least 10\% of the sequences not identifiable by the previously known methods.},
  doi      = {10.1110/ps.041035505},
  pdf      = {../local/Zhang2005Descriptor-based.pdf},
  file     = {Zhang2005Descriptor-based.pdf:local/Zhang2005Descriptor-based.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://dx.doi.org/10.1110/ps.041035505}
}
@article{Zhao2003Application,
  author   = {Zhao, Y. and Pinilla, C. and Valmori, D. and Martin, R. and Simon, R.},
  title    = {Application of support vector machines for {T}-cell epitopes prediction},
  journal  = {Bioinformatics},
  year     = {2003},
  volume   = {19},
  number   = {15},
  pages    = {1978--1984},
  abstract = {Motivation: {T}he {T}-cell receptor, a major histocompatibility complex ({MHC}) molecule, and a bound antigenic peptide, play major roles in the process of antigen-specific {T}-cell activation. {T}-cell recognition was long considered exquisitely specific. {R}ecent data also indicate that it is highly flexible, and one receptor may recognize thousands of different peptides. {D}eciphering the patterns of peptides that elicit a {MHC} restricted {T}-cell response is critical for vaccine development. {R}esults: {F}or the first time we develop a support vector machine ({SVM}) for {T}-cell epitope prediction with an {MHC} type {I} restricted {T}-cell clone. {U}sing cross-validation, we demonstrate that {SVM}s can be trained on relatively small data sets to provide prediction more accurate than those based on previously published methods or on {MHC} binding. {S}upplementary information: {D}ata for 203 synthesized peptides is available at http://linus.nci.nih.gov/{D}ata/{LAU}203\_{P}eptide.pdf},
  pdf      = {../local/Zhao2003Application.pdf},
  file     = {Zhao2003Application.pdf:local/Zhao2003Application.pdf:PDF},
  keywords = {biosvm immunoinformatics},
  owner    = {jeanphilippevert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/15/1978}
}
@comment{{Zhou et al. (2005): classifier ensemble (one SVM, two HMMs) for protein/gene name recognition; BMC Bioinformatics 6, Suppl 1.}}
@article{Zhou2005Recognition,
  author   = {Zhou, GuoDong and Shen, Dan and Zhang, Jie and Su, Jian and Tan, SoonHeng},
  title    = {Recognition of protein/gene names from text using an ensemble of classifiers},
  journal  = {{BMC} {B}ioinformatics},
  year     = {2005},
  volume   = {6},
  number   = {Suppl 1},
  pages    = {S7},
  abstract = {This paper proposes an ensemble of classifiers for biomedical name recognition in which three classifiers, one {S}upport {V}ector {M}achine and two discriminative {H}idden {M}arkov {M}odels, are combined effectively using a simple majority voting strategy. {I}n addition, we incorporate three post-processing modules, including an abbreviation resolution module, a protein/gene name refinement module and a simple dictionary matching module, into the system to further improve the performance. {E}valuation shows that our system achieves the best performance from among 10 systems with a balanced {F}-measure of 82.58 on the closed evaluation of the {B}io{C}reative protein/gene name recognition task ({T}ask 1{A}).},
  doi      = {10.1186/1471-2105-6-S1-S7},
  pdf      = {../local/Zhou2005Recognition.pdf},
  file     = {Zhou2005Recognition.pdf:local/Zhou2005Recognition.pdf:PDF},
  keywords = {biosvm nlp},
  pii      = {1471-2105-6-S1-S7},
  url      = {http://dx.doi.org/10.1186/1471-2105-6-S1-S7}
}
@comment{{Zhou and Mao (2005): LS Bound criterion for gene selection on DNA microarray data; Bioinformatics 21(8).}}
@article{Zhou2005LS,
  author   = {Zhou, Xin and Mao, K. Z.},
  title    = {{LS} {B}ound based gene selection for {DNA} microarray data},
  journal  = {Bioinformatics},
  year     = {2005},
  month    = apr,
  volume   = {21},
  number   = {8},
  pages    = {1559--1564},
  abstract = {M{OTIVATION}: {O}ne problem with discriminant analysis of {DNA} microarray data is that each sample is represented by quite a large number of genes, and many of them are irrelevant, insignificant or redundant to the discriminant problem at hand. {M}ethods for selecting important genes are, therefore, of much significance in microarray data analysis. {I}n the present study, a new criterion, called {LS} {B}ound measure, is proposed to address the gene selection problem. {T}he {LS} {B}ound measure is derived from leave-one-out procedure of {LS}-{SVM}s (least squares support vector machines), and as the upper bound for leave-one-out classification results it reflects to some extent the generalization performance of gene subsets. {RESULTS}: {W}e applied this {LS} {B}ound measure for gene selection on two benchmark microarray datasets: colon cancer and leukemia. {W}e also compared the {LS} {B}ound measure with other evaluation criteria, including the well-known {F}isher's ratio and {M}ahalanobis class separability measure, and other published gene selection algorithms, including {W}eighting factor and {SVM} {R}ecursive {F}eature {E}limination. {T}he strength of the {LS} {B}ound measure is that it provides gene subsets leading to more accurate classification results than the filter method while its computational complexity is at the level of the filter method. {AVAILABILITY}: {A} companion website can be accessed at http://www.ntu.edu.sg/home5/pg02776030/lsbound/. {T}he website contains: (1) the source code of the gene selection algorithm; (2) the complete set of tables and figures regarding the experimental study; (3) proof of the inequality (9). {CONTACT}: ekzmao@ntu.edu.sg.},
  doi      = {10.1093/bioinformatics/bti216},
  pdf      = {../local/Zhou2005LS.pdf},
  file     = {Zhou2005LS.pdf:local/Zhou2005LS.pdf:PDF},
  keywords = {biosvm featureselection microarray},
  pii      = {bti216},
  url      = {http://dx.doi.org/10.1093/bioinformatics/bti216}
}
@comment{{Zien et al. (2000): engineered SVM kernels for recognising translation initiation sites; Bioinformatics 16(9).}}
@article{Zien2000Engineering,
  author   = {Zien, A. and R{\"a}tsch, G. and Mika, S. and Sch{\"o}lkopf, B. and Lengauer, T. and M{\"u}ller, K.-R.},
  title    = {Engineering support vector machine kernels that recognize translation initiation sites},
  journal  = {Bioinformatics},
  year     = {2000},
  volume   = {16},
  number   = {9},
  pages    = {799--807},
  abstract = {Motivation: {I}n order to extract protein sequences from nucleotide sequences, it is an important step to recognize points at which regions start that code for proteins. {T}hese points are called translation initiation sites ({TIS}). {R}esults: {T}he task of finding {TIS} can be modeled as a classification problem. {W}e demonstrate the applicability of support vector machines for this task, and show how to incorporate prior biological knowledge by engineering an appropriate kernel function. {W}ith the described techniques the recognition performance can be improved by 26\% over leading existing approaches. {W}e provide evidence that existing related methods (e.g. {ESTS}can) could profit from advanced {TIS} recognition.},
  pdf      = {../local/Zien2000Engineering.pdf},
  file     = {Zien2000Engineering.pdf:local/Zien2000Engineering.pdf:PDF},
  keywords = {biosvm},
  owner    = {jeanphilippevert},
  url      = {http://bioinformatics.oupjournals.org/cgi/content/abstract/16/9/799}
}
@comment{{jabref-meta: selector_author:}}
@comment{{jabref-meta: selector_journal:Adv. Drug Deliv. Rev.;Am. J. Hum. Genet.;Am. J. Pathol.;Ann. Appl. Stat.;Ann. Math. Statist.;Ann. N. Y. Acad. Sci.;Ann. Probab.;Ann. Stat.;Artif. Intell. Med.;Bernoulli;Biochim. Biophys. Acta;Bioinformatics;Biometrika;BMC Bioinformatics;Br. J. Pharmacol.;Breast Cancer Res.;Cell;Cell. Signal.;Chem. Res. Toxicol.;Clin. Cancer Res.;Combinator. Probab. Comput.;Comm. Pure Appl. Math.;Comput. Chem.;Comput. Comm. Rev.;Comput. Stat. Data An.;Curr. Genom.;Curr. Opin. Chem. Biol.;Curr. Opin. Drug Discov. Devel.;Data Min. Knowl. Discov.;Electron. J. Statist.;Eur. J. Hum. Genet.;FEBS Lett.;Found. Comput. Math.;Genome Biol.;IEEE T. Neural Networ.;IEEE T. Pattern. Anal.;IEEE T. Signal. Proces.;IEEE Trans. Inform. Theory;IEEE Trans. Knowl. Data Eng.;IEEE/ACM Trans. Comput. Biol. Bioinf.;Int. J. Comput. Vision;Int. J. Data Min. Bioinform.;Int. J. Quantum Chem.;J Biol Syst;J. ACM;J. Am. Soc. Inf. Sci. Technol.;J. Am. Stat. Assoc.;J. Bioinform. Comput. Biol.;J. Biol. Chem.;J. Biomed. Inform.;J. Cell. Biochem.;J. Chem. Inf. Comput. Sci.;J. Chem. Inf. Model.;J. Clin. Oncol.;J. Comput. Biol.;J. Comput. Graph. Stat.;J. Eur. Math. Soc.;J. Intell. Inform. Syst.;J. Mach. Learn. Res.;J. Med. Chem.;J. Mol. Biol.;J. R. Stat. Soc. Ser. B;Journal of Statistical Planning and Inference;Mach. Learn.;Math. Program.;Meth. Enzymol.;Mol. Biol. Cell;Mol. Biol. Evol.;Mol. Cell. Biol.;Mol. Syst. Biol.;N. Engl. J. Med.;Nat. Biotechnol.;Nat. Genet.;Nat. Med.;Nat. Methods;Nat. Rev. Cancer;Nat. Rev. Drug Discov.;Nat. Rev. Genet.;Nature;Neural Comput.;Neural Network.;Neurocomputing;Nucleic Acids Res.;Pattern Anal. Appl.;Pattern Recognit.;Phys. Rev. E;Phys. Rev. Lett.;PLoS Biology;PLoS Comput. Biol.;Probab. Theory Relat. Fields;Proc. IEEE;Proc. Natl. Acad. Sci. USA;Protein Eng.;Protein Eng. Des. Sel.;Protein Sci.;Protein. Struct. Funct. Genet.;Random Struct. Algorithm.;Rev. Mod. Phys.;Science;Stat. Probab. Lett.;Statistica Sinica;Theor. Comput. Sci.;Trans. Am. Math. Soc.;Trends Genet.;}}
@comment{{jabref-meta: selector_keywords:biogm;biosvm;breastcancer;cgh;chemogenomics;chemoinformatics;csbcbook;csbcbook-ch1;csbcbook-ch2;csbcbook-ch3;csbcbook-ch4;csbcbook-ch5;csbcbook-ch6;csbcbook-ch7;csbcbook-ch8;csbcbook-ch9;csbcbook-mustread;dimred;featureselection;glycans;herg;hic;highcontentscreening;image;immunoinformatics;kernel-theory;kernelbook;lasso;microarray;ngs;nlp;plasmodium;proteomics;PUlearning;rnaseq;segmentation;sirna;}}
@comment{{jabref-meta: selector_booktitle:Adv. Neural. Inform. Process Syst.;}}
This file was generated by bibtex2html 1.97.