bio.bib

@comment{{This file has been generated by bib2bib 1.97}}
@comment{{Command line: bib2bib ../bibli.bib -c 'subject:"bio" or keywords:"bio"' -ob tmp.bib}}
@article{Tayrac2009Simultaneous,
  author = {{de Tayrac}, M. and L{\^e}, S. and Aubry, M. and Mosser, J. and Husson,
	F.},
  title = {Simultaneous analysis of distinct {Omics} data sets with integration
	of biological knowledge: {Multiple Factor Analysis} approach.},
  journal = {BMC Genomics},
  year = {2009},
  volume = {10},
  pages = {32},
  abstract = {Genomic analysis will greatly benefit from considering in a global
	way various sources of molecular data with the related biological
	knowledge. It is thus of great importance to provide useful integrative
	approaches dedicated to ease the interpretation of microarray data. Here,
	we introduce a data-mining approach, Multiple Factor Analysis (MFA),
	to combine multiple data sets and to add formalized knowledge. MFA
	is used to jointly analyse the structure emerging from genomic and
	transcriptomic data sets. The common structures are underlined and
	graphical outputs are provided such that biological meaning becomes
	easily retrievable. Gene Ontology terms are used to build gene modules
	that are superimposed on the experimentally interpreted plots. Functional
	interpretations are then supported by a step-by-step sequence of
	graphical representations. When applied to genomic and transcriptomic
	data and associated Gene Ontology annotations, our method prioritize
	the biological processes linked to the experimental settings. Furthermore,
	it reduces the time and effort to analyze large amounts of 'Omics'
	data.},
  doi = {10.1186/1471-2164-10-32},
  institution = {CNRS UMR 6061, Universit{\'e} de Rennes 1, IFR 140, Facult{\'e} de M{\'e}decine,
	CS 34317, 35043 Rennes, France. marie.de-tayrac@univ-rennes1.fr},
  keywords = {Animals; Comparative Genomic Hybridization; Factor Analysis, Statistical;
	Gene Expression Profiling, methods; Genomics, methods; Glioma, genetics;
	Humans; Mice; Models, Biological; Oligonucleotide Array Sequence
	Analysis, methods},
  language = {eng},
  medline-pst = {epublish},
  owner = {jp},
  pii = {1471-2164-10-32},
  pmid = {19154582},
  timestamp = {2012.02.29},
  url = {http://dx.doi.org/10.1186/1471-2164-10-32}
}
@article{Consortium2010map,
  author = {{1000 Genomes Project Consortium}},
  title = {A map of human genome variation from population-scale sequencing.},
  journal = {Nature},
  year = {2010},
  volume = {467},
  pages = {1061--1073},
  number = {7319},
  month = oct,
  abstract = {The 1000 Genomes Project aims to provide a deep characterization of
	human genome sequence variation as a foundation for investigating
	the relationship between genotype and phenotype. Here we present
	results of the pilot phase of the project, designed to develop and
	compare different strategies for genome-wide sequencing with high-throughput
	platforms. We undertook three projects: low-coverage whole-genome
	sequencing of 179 individuals from four populations; high-coverage
	sequencing of two mother-father-child trios; and exon-targeted sequencing
	of 697 individuals from seven populations. We describe the location,
	allele frequency and local haplotype structure of approximately 15
	million single nucleotide polymorphisms, 1 million short insertions
	and deletions, and 20,000 structural variants, most of which were
	previously undescribed. We show that, because we have catalogued
	the vast majority of common variation, over 95\% of the currently
	accessible variants found in any individual are present in this data
	set. On average, each person is found to carry approximately 250
	to 300 loss-of-function variants in annotated genes and 50 to 100
	variants previously implicated in inherited disorders. We demonstrate
	how these results can be used to inform association and functional
	studies. From the two trios, we directly estimate the rate of de
	novo germline base substitution mutations to be approximately $10^{-8}$
	per base pair per generation. We explore the data with regard to
	signatures of natural selection, and identify a marked reduction
	of genetic variation in the neighbourhood of genes, due to selection
	at linked sites. These methods and public data will support the next
	phase of human genetic research.},
  doi = {10.1038/nature09534},
  keywords = {Calibration; Chromosomes, Human, Y, genetics; Computational Biology;
	DNA Mutational Analysis; DNA, Mitochondrial, genetics; Evolution,
	Molecular; Female; Genetic Association Studies; Genetic Variation,
	genetics; Genetics, Population, methods; Genome, Human, genetics;
	Genome-Wide Association Study; Genomics, methods; Genotype; Haplotypes,
	genetics; Humans; Male; Mutation, genetics; Pilot Projects; Polymorphism,
	Single Nucleotide, genetics; Recombination, Genetic, genetics; Sample
	Size; Selection, Genetic, genetics; Sequence Alignment; Sequence
	Analysis, DNA, methods},
  language = {eng},
  medline-pst = {ppublish},
  owner = {philippe},
  pii = {nature09534},
  pmid = {20981092},
  timestamp = {2012.02.24},
  url = {http://dx.doi.org/10.1038/nature09534}
}
@article{Achard2001XML,
  author = {Achard, F. and Vaysseix, G. and Barillot, E.},
  title = {{XML}, bioinformatics and data integration.},
  journal = {Bioinformatics},
  year = {2001},
  volume = {17},
  pages = {115--125},
  number = {2},
  month = feb,
  abstract = {Motivation: The eXtensible Markup Language (XML) is an emerging standard
	for structuring documents, notably for the World Wide Web. In this
	paper, the authors present XML and examine its use as a data language
	for bioinformatics. In particular, XML is compared to other languages,
	and some of the potential uses of XML in bioinformatics applications
	are presented. The authors propose to adopt XML for data interchange
	between databases and other sources of data. Finally the discussion
	is illustrated by a test case of a pedigree data model in XML. Contact:
	Emmanuel.Barillot@infobiogen.fr},
  institution = {CRI Infobiogen, 523 place des terrasses de l'agora, 91000 Evry, France.},
  keywords = {Computational Biology; Humans; Information Storage and Retrieval;
	Internet; Programming Languages},
  language = {eng},
  medline-pst = {ppublish},
  owner = {phupe},
  pmid = {11238067},
  timestamp = {2011.06.01}
}
@article{Aebersold2003Mass,
  author = {Aebersold, R. and Mann, M.},
  title = {Mass spectrometry-based proteomics},
  journal = {Nature},
  year = {2003},
  volume = {422},
  pages = {198--207},
  number = {6928},
  month = mar,
  abstract = {Recent successes illustrate the role of mass spectrometry-based proteomics
	as an indispensable tool for molecular and cellular biology and for
	the emerging field of systems biology. {T}hese include the study
	of protein-protein interactions via affinity-based isolations on
	a small and proteome-wide scale, the mapping of numerous organelles,
	the concurrent description of the malaria parasite genome and proteome,
	and the generation of quantitative protein profiles from diverse
	species. {T}he ability of mass spectrometry to identify and, increasingly,
	to precisely quantify thousands of proteins from complex samples
	can be expected to impact broadly on biology and medicine.},
  comment = {A good ref for the detection of protein-protein interactions by coimmunoprecipitation
	followed by mass spectrometry},
  doi = {10.1038/nature01511},
  pdf = {../local/Aebersold2003Mass.pdf},
  file = {Aebersold2003Mass.pdf:Aebersold2003Mass.pdf:PDF},
  keywords = {bio},
  owner = {vert},
  url = {http://dx.doi.org/10.1038/nature01511}
}
@article{Aires-de-Sousa2005Prediction,
  author = {Aires-de-Sousa, J. and Gasteiger, J.},
  title = {Prediction of enantiomeric excess in a combinatorial library of catalytic
	enantioselective reactions.},
  journal = {J {C}omb {C}hem},
  year = {2005},
  volume = {7},
  pages = {298--301},
  number = {2},
  abstract = {A quantitative structure-enantioselectivity relationship was established
	for a combinatorial library of enantioselective reactions performed
	by addition of diethyl zinc to benzaldehyde. {C}hiral catalysts and
	additives were encoded by their chirality codes and presented as
	input to neural networks. {T}he networks were trained to predict
	the enantiomeric excess. {W}ith independent test sets, predictions
	of enantiomeric excess could be made with an average error as low
	as 6\% ee. {M}ultilinear regression, perceptrons, and support vector
	machines were also evaluated as modeling tools. {T}he method is of
	interest for the computer-aided design of combinatorial libraries
	involving chiral compounds or enantioselective reactions. {T}his
	is the first example of a quantitative structure-property relationship
	based on chirality codes.},
  doi = {10.1021/cc049961q},
  pdf = {../local/Aires-de-Sousa2005Prediction.pdf},
  file = {Aires-de-Sousa2005Prediction.pdf:local/Aires-de-Sousa2005Prediction.pdf:PDF},
  keywords = {biosvm chemoinformatics},
  url = {http://dx.doi.org/10.1021/cc049961q}
}
@article{Akutsu2000Inferring,
  author  = {Akutsu, T. and Miyano, S. and Kuhara, S.},
  title   = {Inferring qualitative relations in genetic networks and metabolic
	pathways},
  journal = {Bioinformatics},
  volume  = {16},
  number  = {8},
  pages   = {727--734},
  year    = {2000},
  url     = {http://bioinformatics.oxfordjournals.org/cgi/content/abstract/16/8/727},
  pdf     = {../local/Akutsu2000Inferring.pdf},
  file    = {Akutsu2000Inferring.pdf:local/Akutsu2000Inferring.pdf:PDF},
  subject = {bionet}
}
@article{Ala2008Prediction,
  author = {Ala, U. and Piro, R. M. and Grassi, E. and Damasco, C. and Silengo,
	L. and Oti, M. and Provero, P. and Di Cunto, F.},
  title = {Prediction of human disease genes by human-mouse conserved coexpression
	analysis.},
  journal = {PLoS Comput. Biol.},
  year = {2008},
  volume = {4},
  pages = {e1000043},
  number = {3},
  month = mar,
  abstract = {BACKGROUND: Even in the post-genomic era, the identification of candidate
	genes within loci associated with human genetic diseases is a very
	demanding task, because the critical region may typically contain
	hundreds of positional candidates. Since genes implicated in similar
	phenotypes tend to share very similar expression profiles, high throughput
	gene expression data may represent a very important resource to identify
	the best candidates for sequencing. However, so far, gene coexpression
	has not been used very successfully to prioritize positional candidates.
	METHODOLOGY/PRINCIPAL FINDINGS: We show that it is possible to reliably
	identify disease-relevant relationships among genes from massive
	microarray datasets by concentrating only on genes sharing similar
	expression profiles in both human and mouse. Moreover, we show systematically
	that the integration of human-mouse conserved coexpression with a
	phenotype similarity map allows the efficient identification of disease
	genes in large genomic regions. Finally, using this approach on 850
	OMIM loci characterized by an unknown molecular basis, we propose
	high-probability candidates for 81 genetic diseases. CONCLUSION:
	Our results demonstrate that conserved coexpression, even at the
	human-mouse phylogenetic distance, represents a very strong criterion
	to predict disease-relevant relationships among human genes.},
  doi = {10.1371/journal.pcbi.1000043},
  institution = {Molecular Biotechnology Center, Department of Genetics, Biology and
	Biochemistry, University of Turin, Turin, Italy.},
  keywords = {Algorithms; Animals; Biological Markers; Chromosome Mapping; Conserved
	Sequence; Diagnosis, Computer-Assisted; Gene Expression Profiling;
	Genetic Diseases, Inborn; Genetic Predisposition to Disease; Humans;
	Mice; Proteome},
  owner = {mordelet},
  pmid = {18369433},
  timestamp = {2010.09.28},
  url = {http://dx.doi.org/10.1371/journal.pcbi.1000043}
}
@article{Alexandersson2003SLAM,
  author = {Alexandersson, M. and Cawley, S. and Pachter, L.},
  title = {{SLAM}: cross-species gene finding and alignment with a generalized
	pair hidden {Markov} model.},
  journal = {Genome {R}es.},
  year = {2003},
  volume = {13},
  pages = {496--502},
  number = {3},
  month = mar,
  abstract = {Comparative-based gene recognition is driven by the principle that
	conserved regions between related organisms are more likely than
	divergent regions to be coding. {W}e describe a probabilistic framework
	for gene structure and alignment that can be used to simultaneously
	find both the gene structure and alignment of two syntenic genomic
	regions. {A} key feature of the method is the ability to enhance
	gene predictions by finding the best alignment between two syntenic
	sequences, while at the same time finding biologically meaningful
	alignments that preserve the correspondence between coding exons.
	{O}ur probabilistic framework is the generalized pair hidden {M}arkov
	model, a hybrid of (1). generalized hidden {M}arkov models, which
	have been used previously for gene finding, and (2). pair hidden
	{M}arkov models, which have applications to sequence alignment. {W}e
	have built a gene finding and alignment program called {SLAM}, which
	aligns and identifies complete exon/intron structures of genes in
	two related but unannotated sequences of {DNA}. {SLAM} is able to
	reliably predict gene structures for any suitably related pair of
	organisms, most notably with fewer false-positive predictions compared
	to previous methods (examples are provided for {H}omo sapiens/{M}us
	musculus and {P}lasmodium falciparum/{P}lasmodium vivax comparisons).
	{A}ccuracy is obtained by distinguishing conserved noncoding sequence
	({CNS}) from conserved coding sequence. {CNS} annotation is a novel
	feature of {SLAM} and may be useful for the annotation of {UTR}s,
	regulatory elements, and other noncoding features.},
  doi = {10.1101/gr.424203},
  pdf = {../local/Alexandersson2003SLAM.pdf},
  file = {Alexandersson2003SLAM.pdf:local/Alexandersson2003SLAM.pdf:PDF},
  keywords = {biogm},
  owner = {vert},
  pmid = {12618381},
  timestamp = {2006.01.18},
  url = {http://dx.doi.org/10.1101/gr.424203}
}
@inproceedings{Aliferis2002Machine,
  author = {Aliferis, C. F. and Hardin, D. P. and Massion, P.},
  title = {Machine {L}earning {M}odels {F}or {L}ung {C}ancer {C}lassification
	{U}sing {A}rray {C}omparative {G}enomic {H}ybridization},
  booktitle = {Proceedings of the 2002 {A}merican {M}edical {I}nformatics {A}ssociation
	({AMIA}) {A}nnual {S}ymposium},
  year = {2002},
  pages = {7--11},
  abstract = {Array {CGH} is a recently introduced technology that measures changes
	in the gene copy number of hundreds of genes in a single experiment.
	{T}he primary goal of this study was to develop machine learning
	models that classify non-small {L}ung {C}ancers according to histopathology
	types and to compare several machine learning methods in this learning
	task. {DNA} from tumors of 37 patients (21 squamous carcinomas, and
	16 adenocarcinomas) were extracted and hybridized onto a 452 {BAC}
	clone array. {T}he following algorithms were used: {KNN}, {D}ecision
	{T}ree {I}nduction, {S}upport {V}ector {M}achines and {F}eed-{F}orward
	{N}eural {N}etworks. {P}erformance was measured via leave-one-out
	classification accuracy. {T}he best multi-gene model found had a
	leave-one-out accuracy of 89.2\%. {D}ecision {T}rees performed poorer
	than the other methods in this learning task and dataset. {W}e conclude
	that gene copy numbers as measured by array {CGH} are, collectively,
	an excellent indicator of histological subtype. {S}everal interesting
	research directions are discussed.},
  pdf = {../local/Aliferis2002Machine.pdf},
  file = {Aliferis2002Machine.pdf:local/Aliferis2002Machine.pdf:PDF},
  keywords = {biosvm microarray, cgh},
  owner = {jeanphilippevert}
}
@article{Altschul1997Gapped,
  author = {Altschul, S. F. and Madden, T. L. and Schaffer, A. A. and Zhang, J. and
	Zhang, Z. and Miller, W. and Lipman, D. J.},
  title = {Gapped {BLAST} and {PSI-BLAST}: {A} new generation of protein database
	search programs},
  journal = {Nucleic {A}cids {R}esearch},
  year = {1997},
  volume = {25},
  pages = {3389--3402},
  number = {17},
  pdf = {../local/alts97.pdf},
  file = {alts97.pdf:local/alts97.pdf:PDF},
  subject = {biocasp},
  url = {http://nar.oupjournals.org/cgi/reprint/25/17/3389.pdf}
}
@article{Ambroise2002Selection,
  author = {Ambroise, C. and McLachlan, G. J.},
  title = {Selection bias in gene extraction on the basis of microarray gene-expression
	data},
  journal = {Proc. {N}atl. {A}cad. {S}ci. {USA}},
  year = {2002},
  volume = {99},
  pages = {6562--6566},
  number = {10},
  abstract = {In the context of cancer diagnosis and treatment, we consider the
	problem of constructing an accurate prediction rule on the basis
	of a relatively small number of tumor tissue samples of known type
	containing the expression data on very many (possibly thousands)
	genes. {R}ecently, results have been presented in the literature
	suggesting that it is possible to construct a prediction rule from
	only a few genes such that it has a negligible prediction error rate.
	{H}owever, in these results the test error or the leave-one-out cross-validated
	error is calculated without allowance for the selection bias. {T}here
	is no allowance because the rule is either tested on tissue samples
	that were used in the first instance to select the genes being used
	in the rule or because the cross-validation of the rule is not external
	to the selection process; that is, gene selection is not performed
	in training the rule at each stage of the cross-validation process.
	{W}e describe how in practice the selection bias can be assessed
	and corrected for by either performing a cross-validation or applying
	the bootstrap external to the selection process. {W}e recommend using
	10-fold rather than leave-one-out cross-validation, and concerning
	the bootstrap, we suggest using the so-called .632+ bootstrap error
	estimate designed to handle overfitted prediction rules. {U}sing
	two published data sets, we demonstrate that when correction is made
	for the selection bias, the cross-validated error is no longer zero
	for a subset of only a few genes.},
  pdf = {../local/Ambroise2002Selection.pdf},
  file = {Ambroise2002Selection.pdf:local/Ambroise2002Selection.pdf:PDF},
  keywords = {featureselection biosvm},
  owner = {jeanphilippevert},
  url = {http://www.pnas.org/cgi/content/abstract/99/10/6562}
}
@article{Anderson2003new,
  author = {Anderson, D. C. and Li, W. and Payan, D. G. and Noble, W. S.},
  title = {A new algorithm for the evaluation of shotgun peptide sequencing
	in proteomics: support vector machine classification of peptide {MS/MS}
	spectra and {SEQUEST} scores.},
  journal = {J {P}roteome {R}es},
  year = {2003},
  volume = {2},
  pages = {137--146},
  number = {2},
  abstract = {Shotgun tandem mass spectrometry-based peptide sequencing using programs
	such as {SEQUEST} allows high-throughput identification of peptides,
	which in turn allows the identification of corresponding proteins.
	{W}e have applied a machine learning algorithm, called the support
	vector machine, to discriminate between correctly and incorrectly
	identified peptides using {SEQUEST} output. {E}ach peptide was characterized
	by {SEQUEST}-calculated features such as delta {C}n and {X}corr,
	measurements such as precursor ion current and mass, and additional
	calculated parameters such as the fraction of matched {MS}/{MS} peaks.
	{T}he trained {SVM} classifier performed significantly better than
	previous cutoff-based methods at separating positive from negative
	peptides. {P}ositive and negative peptides were more readily distinguished
	in training set data acquired on a {QTOF}, compared to an ion trap
	mass spectrometer. {T}he use of 13 features, including four new parameters,
	significantly improved the separation between positive and negative
	peptides. {U}se of the support vector machine and these additional
	parameters resulted in a more accurate interpretation of peptide
	{MS}/{MS} spectra and is an important step toward automated interpretation
	of peptide tandem mass spectrometry data in proteomics.},
  pdf = {../local/Anderson2003new.pdf},
  file = {Anderson2003new.pdf:local/Anderson2003new.pdf:PDF},
  keywords = {biosvm proteomics},
  owner = {jeanphilippevert}
}
@article{Aphinyanaphongs2005Text,
  author = {Yindalon Aphinyanaphongs and Ioannis Tsamardinos and Alexander Statnikov
	and Douglas Hardin and Constantin F Aliferis},
  title = {Text categorization models for high-quality article retrieval in
	internal medicine.},
  journal = {J. {A}m. {M}ed. {I}nform. {A}ssoc.},
  year = {2005},
  volume = {12},
  pages = {207--216},
  number = {2},
  abstract = {O{BJECTIVE} {F}inding the best scientific evidence that applies to
	a patient problem is becoming exceedingly difficult due to the exponential
	growth of medical publications. {T}he objective of this study was
	to apply machine learning techniques to automatically identify high-quality,
	content-specific articles for one time period in internal medicine
	and compare their performance with previous {B}oolean-based {P}ub{M}ed
	clinical query filters of {H}aynes et al. {DESIGN} {T}he selection
	criteria of the {ACP} {J}ournal {C}lub for articles in internal medicine
	were the basis for identifying high-quality articles in the areas
	of etiology, prognosis, diagnosis, and treatment. {N}aive {B}ayes,
	a specialized {A}da{B}oost algorithm, and linear and polynomial support
	vector machines were applied to identify these articles. {MEASUREMENTS}
	{T}he machine learning models were compared in each category with
	each other and with the clinical query filters using area under the
	receiver operating characteristic curves, 11-point average recall
	precision, and a sensitivity/specificity match method. {RESULTS}
	{I}n most categories, the data-induced models have better or comparable
	sensitivity, specificity, and precision than the clinical query filters.
	{T}he polynomial support vector machine models perform the best among
	all learning methods in ranking the articles as evaluated by area
	under the receiver operating curve and 11-point average recall precision.
	{CONCLUSION} {T}his research shows that, using machine learning methods,
	it is possible to automatically build models for retrieving high-quality,
	content-specific articles using inclusion or citation by the {ACP}
	{J}ournal {C}lub as a gold standard in a given time period in internal
	medicine that perform better than the 1994 {P}ub{M}ed clinical query
	filters.},
  doi = {10.1197/jamia.M1641},
  pdf = {../local/Aphinyanaphongs2005Text.pdf},
  file = {Aphinyanaphongs2005Text.pdf:local/Aphinyanaphongs2005Text.pdf:PDF},
  keywords = {biosvm nlp},
  pii = {M1641},
  url = {http://dx.doi.org/10.1197/jamia.M1641}
}
@article{Aranda2010IntAct,
  author = {B. Aranda and P. Achuthan and Y. Alam-Faruque and I. Armean and A.
	Bridge and C. Derow and M. Feuermann and A. T. Ghanbarian and S.
	Kerrien and J. Khadake and J. Kerssemakers and C. Leroy and M. Menden
	and M. Michaut and L. Montecchi-Palazzi and S. N. Neuhauser and S.
	Orchard and V. Perreau and B. Roechert and K. van Eijk and H. Hermjakob},
  title = {The {IntAct} molecular interaction database in 2010.},
  journal = {Nucleic Acids Res},
  year = {2010},
  volume = {38},
  pages = {D525--D531},
  number = {Database issue},
  month = jan,
  abstract = {IntAct is an open-source, open data molecular interaction database
	and toolkit. Data is abstracted from the literature or from direct
	data depositions by expert curators following a deep annotation model
	providing a high level of detail. As of September 2009, IntAct contains
	over 200.000 curated binary interaction evidences. In response to
	the growing data volume and user requests, IntAct now provides a
	two-tiered view of the interaction data. The search interface allows
	the user to iteratively develop complex queries, exploiting the detailed
	annotation with hierarchical controlled vocabularies. Results are
	provided at any stage in a simplified, tabular view. Specialized
	views then allows 'zooming in' on the full annotation of interactions,
	interactors and their properties. IntAct source code and data are
	freely available at http://www.ebi.ac.uk/intact.},
  doi = {10.1093/nar/gkp878},
  institution = {EMBL Outstation, European Bioinformatics Institute, Wellcome Trust
	Genome Campus Hinxton, Cambridge CB10 1SD, UK.},
  keywords = {Animals; Computational Biology; Databases, Genetic; Databases, Protein;
	False Positive Reactions; Humans; Information Storage and Retrieval;
	Internet; Programming Languages; Protein Interaction Mapping; Protein
	Structure, Tertiary; Proteins; Software; User-Computer Interface;
	Vocabulary, Controlled},
  owner = {fantine},
  pii = {gkp878},
  pmid = {19850723},
  timestamp = {2010.10.21},
  url = {http://dx.doi.org/10.1093/nar/gkp878}
}
@article{Arimoto2005Development,
  author = {Rieko Arimoto and Madhu-Ashni Prasad and Eric M Gifford},
  title = {Development of {CYP3A4} inhibition models: comparisons of machine-learning
	techniques and molecular descriptors.},
  journal = {J {B}iomol {S}creen},
  year = {2005},
  volume = {10},
  pages = {197--205},
  number = {3},
  month = apr,
  abstract = {Computational models of cytochrome {P}450 3{A}4 inhibition were developed
	based on high-throughput screening data for 4470 proprietary compounds.
	{M}ultiple models differentiating inhibitors ({IC}(50) <3 micro{M})
	and noninhibitors were generated using various machine-learning algorithms
	(recursive partitioning [{RP}], {B}ayesian classifier, logistic regression,
	k-nearest-neighbor, and support vector machine [{SVM}]) with structural
	fingerprints and topological indices. {N}ineteen models were evaluated
	by internal 10-fold cross-validation and also by an independent test
	set. {T}hree most predictive models, {B}arnard {C}hemical {I}nformation
	({BCI})-fingerprint/{SVM}, {MDL}-keyset/{SVM}, and topological indices/{RP},
	correctly classified 249, 248, and 236 compounds of 291 noninhibitors
	and 135, 137, and 147 compounds of 179 inhibitors in the validation
	set. {T}heir overall accuracies were 82\%, 82\%, and 81\%, respectively.
	{I}nvestigating applicability of the {BCI}/{SVM} model found a strong
	correlation between the predictive performance and the structural
	similarity to the training set. {U}sing {T}animoto similarity index
	as a confidence measurement for the predictions, the limitation of
	the extrapolation was 0.7 in the case of the {BCI}/{SVM} model. {T}aking
	consensus of the 3 best models yielded a further improvement in predictive
	capability, kappa = 0.65 and accuracy = 83\%. {T}he consensus model
	could also be tuned to minimize either false positives or false negatives
	depending on the emphasis of the screening.},
  doi = {10.1177/1087057104274091},
  keywords = {biosvm chemoinformatics},
  pii = {10/3/197},
  url = {http://dx.doi.org/10.1177/1087057104274091}
}
@article{Arodz2005Pattern,
  author = {Tomasz Arod{\'z} and Marcin Kurdziel and Erik O D Sevre and David
	A Yuen},
  title = {Pattern recognition techniques for automatic detection of suspicious-looking
	anomalies in mammograms.},
  journal = {Comput. {M}ethods {P}rograms {B}iomed.},
  year = {2005},
  volume = {79},
  pages = {135--149},
  number = {2},
  month = aug,
  abstract = {We have employed two pattern recognition methods used commonly for
	face recognition in order to analyse digital mammograms. {T}he methods
	are based on novel classification schemes, the {A}da{B}oost and the
	support vector machines ({SVM}). {A} number of tests have been carried
	out to evaluate the accuracy of these two algorithms under different
	circumstances. {R}esults for the {A}da{B}oost classifier method are
	promising, especially for classifying mass-type lesions. {I}n the
	best case the algorithm achieved accuracy of 76\% for all lesion
	types and 90\% for masses only. {T}he {SVM} based algorithm did not
	perform as well. {I}n order to achieve a higher accuracy for this
	method, we should choose image features that are better suited for
	analysing digital mammograms than the currently used ones.},
  doi = {10.1016/j.cmpb.2005.03.009},
  pdf = {../local/Arodz2005Pattern.pdf},
  file = {Arodz2005Pattern.pdf:local/Arodz2005Pattern.pdf:PDF},
  keywords = {biosvm image},
  pii = {S0169-2607(05)00083-0},
  url = {http://dx.doi.org/10.1016/j.cmpb.2005.03.009}
}
@article{Ashburner2000Gene,
  author = {M. Ashburner and C. A. Ball and J. A. Blake and D. Botstein and H.
	Butler and J. M. Cherry and A. P. Davis and K. Dolinski and S. S.
	Dwight and J. T. Eppig and M. A. Harris and D. P. Hill and L. Issel-Tarver
	and A. Kasarskis and S. Lewis and J. C. Matese and J. E. Richardson
	and M. Ringwald and G. M. Rubin and G. Sherlock},
  title = {Gene ontology: tool for the unification of biology. {The Gene Ontology
	Consortium}.},
  journal = {Nat Genet},
  year = {2000},
  volume = {25},
  pages = {25--29},
  number = {1},
  month = may,
  doi = {10.1038/75556},
  institution = {Department of Genetics, Stanford University School of Medicine, California,
	USA. cherry@stanford.edu},
  keywords = {Animals; Computer Communication Networks; Databases, Factual; Eukaryotic
	Cells; Genes; Humans; Metaphysics; Mice; Molecular Biology; Sequence
	Analysis, DNA; Terminology as Topic},
  owner = {fantine},
  pmid = {10802651},
  timestamp = {2010.10.25},
  url = {http://dx.doi.org/10.1038/75556}
}
@article{Atalay2005Implicit,
  author = {Atalay, V. and Cetin-Atalay, R.},
  title = {Implicit motif distribution based hybrid computational kernel for
	sequence classification},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21},
  pages = {1429--1436},
  number = {8},
  month = apr,
  abstract = {M{OTIVATION}: {W}e designed a general computational kernel for classification
	problems that require specific motif extraction and search from sequences.
	{I}nstead of searching for explicit motifs, our approach finds the
	distribution of implicit motifs and uses as a feature for classification.
	{I}mplicit motif distribution approach may be used as modus operandi
	for bioinformatics problems that require specific motif extraction
	and search, which is otherwise computationally prohibitive. {RESULTS}:
	{A} system named {P}2{SL} that infer protein subcellular targeting
	was developed through this computational kernel. {T}argeting-signal
	was modeled by the distribution of subsequence occurrences (implicit
	motifs) using self-organizing maps. {T}he boundaries among the classes
	were then determined with a set of support vector machines. {P}2{SL}
	hybrid computational system achieved approximately 81\% of prediction
	accuracy rate over {ER} targeted, cytosolic, mitochondrial and nuclear
	protein localization classes. {P}2{SL} additionally offers the distribution
	potential of proteins among localization classes, which is particularly
	important for proteins, shuttle between nucleus and cytosol. {AVAILABILITY}:
	http://staff.vbi.vt.edu/volkan/p2sl and http://www.i-cancer.fen.bilkent.edu.tr/p2sl
	{CONTACT}: rengul@bilkent.edu.tr.},
  doi = {10.1093/bioinformatics/bti212},
  pdf = {../local/Atalay2005Implicit.pdf},
  file = {Atalay2005Implicit.pdf:local/Atalay2005Implicit.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1093/bioinformatics/bti212}
}
@article{Bagirov2003New,
  author = {A. M. Bagirov and B. Ferguson and S. Ivkovic and G. Saunders and
	J. Yearwood},
  title = {New algorithms for multi-class cancer diagnosis using tumor gene
	expression signatures.},
  journal = {Bioinformatics},
  year = {2003},
  volume = {19},
  pages = {1800--1807},
  number = {14},
  month = sep,
  abstract = {M{OTIVATION}: {T}he increasing use of {DNA} microarray-based tumor
	gene expression profiles for cancer diagnosis requires mathematical
	methods with high accuracy for solving clustering, feature selection
	and classification problems of gene expression data. {RESULTS}: {N}ew
	algorithms are developed for solving clustering, feature selection
	and classification problems of gene expression data. {T}he clustering
	algorithm is based on optimization techniques and allows the calculation
	of clusters step-by-step. {T}his approach allows us to find as many
	clusters as a data set contains with respect to some tolerance. {F}eature
	selection is crucial for a gene expression database. {O}ur feature
	selection algorithm is based on calculating overlaps of different
	genes. {T}he database used, contains over 16 000 genes and this number
	is considerably reduced by feature selection. {W}e propose a classification
	algorithm where each tissue sample is considered as the center of
	a cluster which is a ball. {T}he results of numerical experiments
	confirm that the classification algorithm in combination with the
	feature selection algorithm perform slightly better than the published
	results for multi-class classifiers based on support vector machines
	for this data set. {AVAILABILITY}: {A}vailable on request from the
	authors.},
  pdf = {../local/Bagirov2003New.pdf},
  file = {Bagirov2003New.pdf:local/Bagirov2003New.pdf:PDF},
  keywords = {Algorithms, Amino Acid Sequence, Anion Exchange Resins, Antigen-Antibody
	Complex, Artificial Intelligence, Automated, Automatic Data Processing,
	Biological, Blood Cells, Chemical, Chromatography, Cluster Analysis,
	Comparative Study, Computational Biology, Computer Simulation, Computer-Assisted,
	DNA, Data Interpretation, Databases, Decision Making, Decision Trees,
	Diffusion Magnetic Resonance Imaging, English Abstract, Epitopes,
	Expert Systems, Factual, Fuzzy Logic, Gene Expression Profiling,
	Gene Expression Regulation, Gene Targeting, Genetic, Genome, Histocompatibility
	Antigens Class I, Humans, Image Interpretation, Image Processing,
	In Vitro, Indicators and Reagents, Information Storage and Retrieval,
	Ion Exchange, Least-Squares Analysis, Liver Cirrhosis, Magnetic Resonance
	Imaging, Male, Models, Molecular Sequence Data, Neoplasms, Neoplastic,
	Neural Networks (Computer), Non-P.H.S., Non-U.S. Gov't, Nonlinear Dynamics,
	Nucleic Acid Conformation, Oligonucleotide Array Sequence Analysis, P.H.S.,
	Pattern Recognition, Prostatic Neoplasms, Protein, Protein Binding,
	Protein Interaction Mapping, Proteins, Proteome, Quantitative Structure-Activity
	Relationship, RNA, ROC Curve, Reproducibility of Results, Research
	Support, Sensitivity and Specificity, Sequence Alignment, Sequence
	Analysis, Severity of Illness Index, Statistical, Structure-Activity
	Relationship, Subtraction Technique, T-Lymphocyte, Transcription
	Factors, Transfer, Treatment Outcome, Tumor Markers, U.S. Gov't,
	User-Computer Interface},
  pmid = {14512351},
  url = {http://bioinformatics.oxfordjournals.org/cgi/content/abstract/19/14/1800}
}
@article{Baldi1999Exploiting,
  author = {Baldi, P. and Brunak, S. and Frasconi, P. and Soda, G. and Pollastri,
	G.},
  title = {Exploiting the past and the future in protein secondary structure
	prediction},
  journal = {Bioinformatics},
  year = {1999},
  volume = {15},
  pages = {937--946},
  number = {11},
  pdf = {../local/bald99.pdf},
  file = {bald99.pdf:local/bald99.pdf:PDF},
  subject = {biocasp},
  url = {http://bioinformatics.oupjournals.org/cgi/reprint/15/11/937.pdf}
}
@article{Bao2005Identifying,
  author = {Lei Bao},
  title = {Identifying genes related to chemosensitivity using support vector
	machine.},
  journal = {Methods {M}ol {M}ed},
  year = {2005},
  volume = {111},
  pages = {233--240},
  abstract = {In an effort to identify genes involved in chemosensitivity and to
	evaluate the functional relationships between genes and anticancer
	drugs acting by the same mechanism, a supervised machine learning
	approach called support vector machine ({SVM}) is used to associate
	genes with any of five predefined anticancer drug mechanistic categories.
	{T}he drug activity profiles are used as training examples to train
	the {SVM} and then the gene expression profiles are used as test
	examples to predict their associated mechanistic categories. {T}his
	method of correlating drugs and genes provides a strategy for finding
	novel biologically significant relationships for molecular pharmacology.},
  keywords = {biosvm},
  pii = {1-59259-889-7:233}
}
@article{Bao2005Prediction,
  author = {Lei Bao and Yan Cui},
  title = {Prediction of the phenotypic effects of non-synonymous single nucleotide
	polymorphisms using structural and evolutionary information.},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21},
  pages = {2185--2190},
  number = {10},
  month = may,
  abstract = {M{OTIVATION}: {T}here has been great expectation that the knowledge
	of an individual's genotype will provide a basis for assessing susceptibility
	to diseases and designing individualized therapy. {N}on-synonymous
	single nucleotide polymorphisms (ns{SNP}s) that lead to an amino
	acid change in the protein product are of particular interest because
	they account for nearly half of the known genetic variations related
	to human inherited diseases. {T}o facilitate the identification of
	disease-associated ns{SNP}s from a large number of neutral ns{SNP}s,
	it is important to develop computational tools to predict the phenotypic
	effects of ns{SNP}s. {RESULTS}: {W}e prepared a training set based
	on the variant phenotypic annotation of the {S}wiss-{P}rot database
	and focused our analysis on ns{SNP}s having homologous 3{D} structures.
	{S}tructural environment parameters derived from the 3{D} homologous
	structure as well as evolutionary information derived from the multiple
	sequence alignment were used as predictors. {T}wo machine learning
	methods, support vector machine and random forest, were trained and
	evaluated. {W}e compared the performance of our method with that
	of the {SIFT} algorithm, which is one of the best predictive methods
	to date. {A}n unbiased evaluation study shows that for ns{SNP}s with
	sufficient evolutionary information (with not <10 homologous sequences),
	the performance of our method is comparable with the {SIFT} algorithm,
	while for ns{SNP}s with insufficient evolutionary information (<10
	homologous sequences), our method outperforms the {SIFT} algorithm
	significantly. {T}hese findings indicate that incorporating structural
	information is critical to achieving good prediction accuracy when
	sufficient evolutionary information is not available. {AVAILABILITY}:
	{T}he codes and curated dataset are available at http://compbio.utmem.edu/snp/dataset/},
  doi = {10.1093/bioinformatics/bti365},
  pdf = {../local/Bao2005Prediction.pdf},
  file = {Bao2005Prediction.pdf:local/Bao2005Prediction.pdf:PDF},
  keywords = {biosvm},
  pii = {bti365},
  url = {http://dx.doi.org/10.1093/bioinformatics/bti365}
}
@article{Bao2002Identifying,
  author   = {Bao, L. and Sun, Z.},
  title    = {Identifying genes related to drug anticancer mechanisms using support
	vector machine},
  journal  = {F{EBS} {L}ett.},
  volume   = {521},
  pages    = {109--114},
  year     = {2002},
  abstract = {In an effort to identify genes related to the cell line chemosensitivity
	and to evaluate the functional relationships between genes and anticancer
	drugs acting by the same mechanism, a supervised machine learning
	approach called support vector machine was used to label genes into
	any of the five predefined anticancer drug mechanistic categories.
	{A}mong dozens of unequivocally categorized genes, many were known
	to be causally related to the drug mechanisms. {F}or example, a few
	genes were found to be involved in the biological process triggered
	by the drugs (e.g. {DNA} polymerase epsilon was the direct target
	for the drugs from {DNA} antimetabolites category). {DNA} repair-related
	genes were found to be enriched for about eight-fold in the resulting
	gene set relative to the entire gene set. {S}ome uncharacterized
	transcripts might be of interest in future studies. {T}his method
	of correlating the drugs and genes provides a strategy for finding
	novel biologically significant relationships for molecular pharmacology.},
  subject  = {biokernel},
  keywords = {biosvm microarray},
  url      = {http://www.elsevier.com/febs/402/19/42/article.html},
  pdf      = {../local/bao02.pdf},
  file     = {bao02.pdf:local/bao02.pdf:PDF}
}
@article{Barabasi1999Emergence,
  author = {Barab{\'a}si, A.-L. and Albert, R.},
  title = {Emergence of scaling in random networks},
  journal = {Science},
  year = {1999},
  volume = {286},
  pages = {509--512},
  number = {5439},
  abstract = {Systems as diverse as genetic networks or the World Wide Web are best
	described as networks with complex topology. A common property of
	many large networks is that the vertex connectivities follow a scale-free
	power-law distribution. This feature was found to be a consequence
	of two generic mechanisms: (i) networks expand continuously by the
	addition of new vertices, and (ii) new vertices attach preferentially
	to sites that are already well connected. A model based on these
	two ingredients reproduces the observed stationary scale-free distributions,
	which indicates that the development of large networks is governed
	by robust self-organizing phenomena that go beyond the particulars
	of the individual systems.},
  pdf = {../local/Barabasi1999Emergence.pdf},
  file = {Barabasi1999Emergence.pdf:local/Barabasi1999Emergence.pdf:PDF},
  subject = {bionet},
  url = {http://www.sciencemag.org/cgi/reprint/286/5439/509.pdf}
}
@article{Baumgartner2004Supervised,
  author = {Baumgartner, C. and Bohm, C. and Baumgartner, D. and Marini, G. and
	Weinberger, K. and Olgemoller, B. and Liebl, B. and Roscher, A. A.},
  title = {Supervised machine learning techniques for the classification of
	metabolic disorders in newborns},
  journal = {Bioinformatics},
  year = {2004},
  volume = {20},
  pages = {2985--2996},
  number = {17},
  abstract = {Motivation: {D}uring the {B}avarian newborn screening programme all
	newborns have been tested for about 20 inherited metabolic disorders.
	{O}wing to the amount and complexity of the generated experimental
	data, machine learning techniques provide a promising approach to
	investigate novel patterns in high-dimensional metabolic data which
	form the source for constructing classification rules with high discriminatory
	power. {R}esults: {S}ix machine learning techniques have been investigated
	for their classification accuracy focusing on two metabolic disorders,
	phenylketonuria ({PKU}) and medium-chain acyl-{C}o{A} dehydrogenase
	deficiency ({MCADD}). {L}ogistic regression analysis led to superior
	classification rules (sensitivity >96.8\%, specificity >99.98\%) compared
	to all investigated algorithms. {I}ncluding novel constellations
	of metabolites into the models, the positive predictive value could
	be strongly increased ({PKU} 71.9\% versus 16.2\%, {MCADD} 88.4\% versus
	54.6\% compared to the established diagnostic markers). {O}ur results
	clearly prove that the mined data confirm the known and indicate
	some novel metabolic patterns which may contribute to a better understanding
	of newborn metabolism. {A}vailability: {WEKA} machine learning package:
	www.cs.waikato.ac.nz/~ml/weka and statistical software package {ADE}-4:
	http://pbil.univ-lyon1.fr/{ADE}-4},
  doi = {10.1093/bioinformatics/bth343},
  pdf = {../local/Baumgartner2004Supervised.pdf},
  file = {Baumgartner2004Supervised.pdf:local/Baumgartner2004Supervised.pdf:PDF},
  keywords = {biosvm proteomics},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/20/17/2985}
}
@article{Bazzani2001SVM,
  author = {A. Bazzani and A. Bevilacqua and D. Bollini and R. Brancaccio and
	R. Campanini and N. Lanconelli and A. Riccardi and D. Romani},
  title = {An {SVM} classifier to separate false signals from microcalcifications
	in digital mammograms.},
  journal = {Phys {M}ed {B}iol},
  year = {2001},
  volume = {46},
  pages = {1651--1663},
  number = {6},
  month = jun,
  abstract = {In this paper we investigate the feasibility of using an {SVM} (support
	vector machine) classifier in our automatic system for the detection
	of clustered microcalcifications in digital mammograms. {SVM} is
	a technique for pattern recognition which relies on the statistical
	learning theory. {I}t minimizes a function of two terms: the number
	of misclassified vectors of the training set and a term regarding
	the generalization classifier capability. {W}e compare the {SVM}
	classifier with an {MLP} (multi-layer perceptron) in the false-positive
	reduction phase of our detection scheme: a detected signal is considered
	either microcalcification or false signal, according to the value
	of a set of its features. {T}he {SVM} classifier gets slightly better
	results than the {MLP} one ({A}z value of 0.963 against 0.958) in
	the presence of a high number of training data; the improvement becomes
	much more evident ({A}z value of 0.952 against 0.918) in training
	sets of reduced size. {F}inally, the setting of the {SVM} classifier
	is much easier than the {MLP} one.},
  doi = {10.1088/0031-9155/46/6/305},
  pdf = {../local/Bazzani2001SVM.pdf},
  file = {Bazzani2001SVM.pdf:local/Bazzani2001SVM.pdf:PDF},
  keywords = {biosvm image},
  url = {http://dx.doi.org/10.1088/0031-9155/46/6/305}
}
@article{Beal2005Bayesian,
  author = {Beal, M. J. and Falciani, F. and Ghahramani, Z. and Rangel, C. and
	Wild, D. L.},
  title = {A {B}ayesian approach to reconstructing genetic regulatory networks
	with hidden factors.},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21},
  pages = {349--356},
  number = {3},
  month = feb,
  abstract = {M{OTIVATION}: {W}e have used state-space models ({SSM}s) to reverse
	engineer transcriptional networks from highly replicated gene expression
	profiling time series data obtained from a well-established model
	of {T} cell activation. {SSM}s are a class of dynamic {B}ayesian
	networks in which the observed measurements depend on some hidden
	state variables that evolve according to {M}arkovian dynamics. {T}hese
	hidden variables can capture effects that cannot be directly measured
	in a gene expression profiling experiment, for example: genes that
	have not been included in the microarray, levels of regulatory proteins,
	the effects of m{RNA} and protein degradation, etc. {RESULTS}: {W}e
	have approached the problem of inferring the model structure of these
	state-space models using both classical and {B}ayesian methods. {I}n
	our previous work, a bootstrap procedure was used to derive classical
	confidence intervals for parameters representing 'gene-gene' interactions
	over time. {I}n this article, variational approximations are used
	to perform the analogous model selection task in the {B}ayesian context.
	{C}ertain interactions are present in both the classical and the
	{B}ayesian analyses of these regulatory networks. {T}he resulting
	models place {J}un{B} and {J}un{D} at the centre of the mechanisms
	that control apoptosis and proliferation. {T}hese mechanisms are
	key for clonal expansion and for controlling the long term behavior
	(e.g. programmed cell death) of these cells. {AVAILABILITY}: {S}upplementary
	data is available at http://public.kgi.edu/wild/index.htm and {M}atlab
	source code for variational {B}ayesian learning of {SSM}s is available
	at http://www.cse.ebuffalo.edu/faculty/mbeal/software.html.},
  doi = {10.1093/bioinformatics/bti014},
  pdf = {../local/Beal2005Bayesian.pdf},
  file = {Beal2005Bayesian.pdf:local/Beal2005Bayesian.pdf:PDF},
  keywords = {biogm},
  owner = {vert},
  pii = {bti014},
  pmid = {15353451},
  timestamp = {2006.01.18},
  url = {http://dx.doi.org/10.1093/bioinformatics/bti014}
}
@article{Beerenwinkel2003Methods,
  author = {Beerenwinkel, N. and Lengauer, T. and Daumer, M. and Kaiser, R. and
	Walter, H. and Korn, K. and Hoffmann, D. and Selbig, J.},
  title = {Methods for optimizing antiviral combination therapies},
  journal = {Bioinformatics},
  year = {2003},
  volume = {19},
  pages = {i16--i25},
  number = {Suppl. 1},
  abstract = {Motivation: {D}espite some progress with antiretroviral combination
	therapies, therapeutic success in the management of {HIV}-infected
	patients is limited. {T}he evolution of drug-resistant genetic variants
	in response to therapy plays a key role in treatment failure and
	finding a new potent drug combination after therapy failure is considered
	challenging. {R}esults: {T}o estimate the activity of a drug combination
	against a particular viral strain, we develop a scoring function
	whose independent variables describe a set of antiviral agents and
	viral {DNA} sequences coding for the molecular targets of the respective
	drugs. {T}he construction of this activity score involves (1) predicting
	phenotypic drug resistance from genotypes for each drug individually,
	(2) probabilistic modeling of predicted resistance values and integration
	into a score for drug combinations, and (3) searching through the
	mutational neighborhood of the considered strain in order to estimate
	activity on nearby mutants. {F}or a clinical data set, we determine
	the optimal search depth and show that the scoring scheme is predictive
	of therapeutic outcome. {P}roperties of the activity score and applications
	are discussed. {C}ontact: beerenwinkel@mpi-sb.mpg.de {K}eywords:
	{HIV}, antiretroviral therapy, drug resistance, {SVM} regression,
	therapy optimization, sequence space search.},
  pdf = {../local/Beerenwinkel2003Methods.pdf},
  file = {Beerenwinkel2003Methods.pdf:local/Beerenwinkel2003Methods.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/suppl_1/i16}
}
@article{Beerenwinkel2001Geno2pheno,
  author = {Beerenwinkel, N. and Schmidt, B. and Walter, H. and Kaiser, R. and
	Lengauer, T. and Hoffmann, D. and Korn, K. and Selbig, J.},
  title = {{G}eno2pheno: {I}nterpreting {G}enotypic {HIV} {D}rug {R}esistance
	{T}ests},
  journal = {I{EEE} {I}ntelligent {S}ystems},
  year = {2001},
  volume = {16},
  pages = {35--41},
  number = {6},
  abstract = {Rapid accumulation of resistance mutations in the genome of the human
	immunodeficiency virus ({HIV}) plays a central role in drug treatment
	failure in infected patients. {T}he authors have developed geno2pheno,
	an intelligent system that uses the information encoded in the viral
	genomic sequence to predict resistance or susceptibility of the virus
	to 13 antiretroviral agents. {T}o predict phenotypic drug resistance
	from genotype, they applied two machine learning techniques: decision
	trees and linear support vector machines. {T}hese techniques performed
	learning on more than 400 genotype-phenotype pairs for each drug.
	{T}he authors compared the generalization performance of the two
	families of models in leave-one-out experiments. {E}xcept for three
	drugs, all error estimates ranged between 7.25 and 15.5 percent.
	{S}upport vector machines performed slightly better for most drugs,
	but knowledge extraction was easier for decision trees. {G}eno2pheno
	is freely available at http://cartan.gmd.de/geno2pheno.html.},
  doi = {10.1109/5254.972080},
  pdf = {../local/Beerenwinkel2001Geno2pheno.pdf},
  file = {Beerenwinkel2001Geno2pheno.pdf:local/Beerenwinkel2001Geno2pheno.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1109/5254.972080}
}
@article{Ben-Dor2000Tissue,
  author = {Ben-Dor, A. and Bruhn, L. and Friedman, N. and Nachman, I. and Schummer,
	M. and Yakhini, Z.},
  title = {Tissue Classification with Gene Expression Profiles},
  journal = {J. Comput. Biol.},
  year = {2000},
  volume = {7},
  pages = {559--583},
  number = {3-4},
  abstract = {Constantly improving gene expression profiling technologies are expected
	to provide understanding and insight into cancer-related cellular
	processes. {G}ene expression data is also expected to significantly
	aid in the development of efficient cancer diagnosis and classification
	platforms. {I}n this work we examine three sets of gene expression
	data measured across sets of tumor(s) and normal clinical samples:
	{T}he first set consists of 2,000 genes, measured in 62 epithelial
	colon samples ({A}lon et al., 1999). {T}he second consists of approximately
	equal to 100,000 clones, measured in 32 ovarian samples (unpublished
	extension of data set described in {S}chummer et al. (1999)). {T}he
	third set consists of approximately equal to 7,100 genes, measured
	in 72 bone marrow and peripheral blood samples ({G}olub et al, 1999).
	{W}e examine the use of scoring methods, measuring separation of
	tissue type (e.g., tumors from normals) using individual gene expression
	levels. {T}hese are then coupled with high-dimensional classification
	methods to assess the classification power of complete expression
	profiles. {W}e present results of performing leave-one-out cross
	validation ({LOOCV}) experiments on the three data sets, employing
	nearest neighbor classifier, {SVM} ({C}ortes and {V}apnik, 1995),
	{A}da{B}oost ({F}reund and {S}chapire, 1997) and a novel clustering-based
	classification technique. {A}s tumor samples can differ from normal
	samples in their cell-type composition, we also perform {LOOCV} experiments
	using appropriately modified sets of genes, attempting to eliminate
	the resulting bias. {W}e demonstrate success rate of at least 90\%
	in tumor versus normal classification, using sets of selected genes,
	with, as well as without, cellular-contamination-related members.
	{T}hese results are insensitive to the exact selection mechanism,
	over a certain range.},
  pdf = {../local/Ben-Dor2000Tissue.pdf},
  file = {Ben-Dor2000Tissue.pdf:local/Ben-Dor2000Tissue.pdf:PDF},
  keywords = {biosvm microarray},
  owner = {jeanphilippevert},
  url = {http://www.liebertonline.com/doi/abs/10.1089/106652700750050943}
}
@article{Ben-Hur2003Remote,
  author = {Ben-Hur, A. and Brutlag, D.},
  title = {Remote homology detection: a motif based approach},
  journal = {Bioinformatics},
  year = {2003},
  volume = {19},
  pages = {i26--i33},
  number = {Suppl. 1},
  abstract = {Motivation: {R}emote homology detection is the problem of detecting
	homology in cases of low sequence similarity. {I}t is a hard computational
	problem with no approach that works well in all cases. {R}esults:
	{W}e present a method for detecting remote homology that is based
	on the presence of discrete sequence motifs. {T}he motif content
	of a pair of sequences is used to define a similarity that is used
	as a kernel for a {S}upport {V}ector {M}achine ({SVM}) classifier.
	{W}e test the method on two remote homology detection tasks: prediction
	of a previously unseen {SCOP} family and prediction of an enzyme
	class given other enzymes that have a similar function on other substrates.
	{W}e find that it performs significantly better than an {SVM} method
	that uses {BLAST} or {S}mith-{W}aterman similarity scores as features.
	{A}vailability: {T}he software is available from the authors upon
	request.},
  pdf = {../local/Ben-Hur2003Remote.pdf},
  file = {Ben-Hur2003Remote.pdf:local/Ben-Hur2003Remote.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/suppl_1/i26}
}
@article{Ben-Hur2005Kernel,
  author = {Ben-Hur, A. and Noble, W. S.},
  title = {Kernel methods for predicting protein-protein interactions.},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21},
  pages = {i38--i46},
  number = {Suppl. 1},
  month = jun,
  abstract = {M{OTIVATION}: {D}espite advances in high-throughput methods for discovering
	protein-protein interactions, the interaction networks of even well-studied
	model organisms are sketchy at best, highlighting the continued need
	for computational methods to help direct experimentalists in the
	search for novel interactions. {RESULTS}: {W}e present a kernel method
	for predicting protein-protein interactions using a combination of
	data sources, including protein sequences, {G}ene {O}ntology annotations,
	local properties of the network, and homologous interactions in other
	species. {W}hereas protein kernels proposed in the literature provide
	a similarity between single proteins, prediction of interactions
	requires a kernel between pairs of proteins. {W}e propose a pairwise
	kernel that converts a kernel between single proteins into a kernel
	between pairs of proteins, and we illustrate the kernel's effectiveness
	in conjunction with a support vector machine classifier. {F}urthermore,
	we obtain improved performance by combining several sequence-based
	kernels based on k-mer frequency, motif and domain content and by
	further augmenting the pairwise sequence kernel with features that
	are based on other sources of data. {W}e apply our method to predict
	physical interactions in yeast using data from the {BIND} database.
	{A}t a false positive rate of 1\% the classifier retrieves close
	to 80\% of a set of trusted interactions. {W}e thus demonstrate the
	ability of our method to make accurate predictions despite the sizeable
	fraction of false positives that are known to exist in interaction
	databases. {AVAILABILITY}: {T}he classification experiments were
	performed using {P}y{ML} available at http://pyml.sourceforge.net.
	{D}ata are available at: http://noble.gs.washington.edu/proj/sppi
	{CONTACT}: asa@gs.washington.edu.},
  doi = {10.1093/bioinformatics/bti1016},
  pdf = {../local/Ben-Hur2005Kernel.pdf},
  file = {Ben-Hur2005Kernel.pdf:local/Ben-Hur2005Kernel.pdf:PDF},
  keywords = {biosvm},
  pii = {21/suppl_1/i38},
  url = {http://dx.doi.org/10.1093/bioinformatics/bti1016}
}
@article{Bern2004Automatic,
  author = {Bern, M. and Goldberg, D. and McDonald, W. H. and Yates, III, J. R.},
  title = {Automatic {Q}uality {A}ssessment of {P}eptide {T}andem {M}ass {S}pectra},
  journal = {Bioinformatics},
  year = {2004},
  volume = {20},
  pages = {i49--i54},
  number = {Suppl. 1},
  abstract = {Motivation: {A} powerful proteomics methodology couples high-performance
	liquid chromatography ({HPLC}) with tandem mass spectrometry and
	database-search software, such as {SEQUEST}. {S}uch a set-up, however,
	produces a large number of spectra, many of which are of too poor
	quality to be useful. {H}ence a filter that eliminates poor spectra
	before the database search can significantly improve throughput and
	robustness. {M}oreover, spectra judged to be of high quality, but
	that cannot be identified by database search, are prime candidates
	for still more computationally intensive methods, such as de novo
	sequencing or wider database searches including post-translational
	modifications. {R}esults: {W}e report on two different approaches
	to assessing spectral quality prior to identification: binary classification,
	which predicts whether or not {SEQUEST} will be able to make an identification,
	and statistical regression, which predicts a more universal quality
	metric involving the number of b- and y-ion peaks. {T}he best of
	our binary classifiers can eliminate over 75\% of the unidentifiable
	spectra while losing only 10\% of the identifiable spectra. {S}tatistical
	regression can pick out spectra of modified peptides that can be
	identified by a de novo program but not by {SEQUEST}. {I}n a section
	of independent interest, we discuss intensity normalization of mass
	spectra.},
  pdf = {../local/Bern2004Automatic.pdf},
  file = {Bern2004Automatic.pdf:local/Bern2004Automatic.pdf:PDF},
  keywords = {biosvm proteomics},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/20/suppl_1/i49}
}
@article{Bernardo2005Chemogenomica,
  author = {di Bernardo, D. and Thompson, M. J. and Gardner, T. S. and Chobot,
	S. E. and Eastwood, E. L. and Wojtovich, A. P. and Elliott, S. J. and
	Schaus, S. E. and Collins, J. J.},
  title = {Chemogenomic profiling on a genome-wide scale using reverse-engineered
	gene networks.},
  journal = {Nat Biotechnol},
  year = {2005},
  volume = {23},
  pages = {377--383},
  number = {3},
  month = mar,
  abstract = {A major challenge in drug discovery is to distinguish the molecular
	targets of a bioactive compound from the hundreds to thousands of
	additional gene products that respond indirectly to changes in the
	activity of the targets. Here, we present an integrated computational-experimental
	approach for computing the likelihood that gene products and associated
	pathways are targets of a compound. This is achieved by filtering
	the mRNA expression profile of compound-exposed cells using a reverse-engineered
	model of the cell's gene regulatory network. We apply the method
	to a set of 515 whole-genome yeast expression profiles resulting
	from a variety of treatments (compounds, knockouts and induced expression),
	and correctly enrich for the known targets and associated pathways
	in the majority of compounds examined. We demonstrate our approach
	with PTSB, a growth inhibitory compound with a previously unknown
	mode of action, by predicting and validating thioredoxin and thioredoxin
	reductase as its target.},
  doi = {10.1038/nbt1075},
  institution = {Telethon Institute for Genetics and Medicine, Naples, Italy.},
  keywords = {Algorithms; Artificial Intelligence; Computer Simulation; Drug Delivery
	Systems; Drug Design; Gene Expression Profiling; Gene Expression
	Regulation; Models, Biological; Models, Statistical; Protein Engineering;
	Protein Interaction Mapping; Saccharomyces cerevisiae; Saccharomyces
	cerevisiae Proteins; Signal Transduction; Thioredoxin-Disulfide Reductase;
	Thioredoxins},
  owner = {fantine},
  pii = {nbt1075},
  pmid = {15765094},
  timestamp = {2010.10.21},
  url = {http://dx.doi.org/10.1038/nbt1075}
}
@article{Bhasin2005GPCRsclass,
  author = {Bhasin, M. and Raghava, G. P. S.},
  title = {G{PCR}sclass: a web tool for the classification of amine type of
	{G}-protein-coupled receptors.},
  journal = {Nucleic {A}cids {R}es.},
  year = {2005},
  volume = {33},
  pages = {W143--W147},
  number = {Web Server issue},
  month = jul,
  abstract = {The receptors of amine subfamily are specifically major drug targets
	for therapy of nervous disorders and psychiatric diseases. {T}he
	recognition of novel amine type of receptors and their cognate ligands
	is of paramount interest for pharmaceutical companies. {I}n the past,
	{C}hou and co-workers have shown that different types of amine receptors
	are correlated with their amino acid composition and are predictable
	on its basis with considerable accuracy [{E}lrod and {C}hou (2002)
	{P}rotein {E}ng., 15, 713-715]. {T}his motivated us to develop a
	better method for the recognition of novel amine receptors and for
	their further classification. {T}he method was developed on the basis
	of amino acid composition and dipeptide composition of proteins using
	support vector machine. {T}he method was trained and tested on 167
	proteins of amine subfamily of {G}-protein-coupled receptors ({GPCR}s).
	{T}he method discriminated amine subfamily of {GPCR}s from globular
	proteins with {M}atthew's correlation coefficient of 0.98 and 0.99
	using amino acid composition and dipeptide composition, respectively.
	{I}n classifying different types of amine receptors using amino acid
	composition and dipeptide composition, the method achieved an accuracy
	of 89.8 and 96.4\%, respectively. {T}he performance of the method
	was evaluated using 5-fold cross-validation. {T}he dipeptide composition
	based method predicted 67.6\% of protein sequences with an accuracy
	of 100\% with a reliability index > or =5. {A} web server {GPCR}sclass
	has been developed for predicting amine-binding receptors from its
	amino acid sequence [http://www.imtech.res.in/raghava/gpcrsclass/
	and http://bioinformatics.uams.edu/raghava/gpersclass/ (mirror site)].},
  doi = {10.1093/nar/gki351},
  pdf = {../local/Bhasin2005GPCRsclass.pdf},
  file = {Bhasin2005GPCRsclass.pdf:local/Bhasin2005GPCRsclass.pdf:PDF},
  keywords = {biosvm},
  pii = {33/suppl_2/W143},
  url = {http://dx.doi.org/10.1093/nar/gki351}
}
@article{Bhasin2005Pcleavage,
  author = {Bhasin, M. and Raghava, G. P. S.},
  title = {Pcleavage: an {SVM} based method for prediction of constitutive proteasome
	and immunoproteasome cleavage sites in antigenic sequences.},
  journal = {Nucleic {A}cids {R}es},
  year = {2005},
  volume = {33},
  pages = {W202--W207},
  number = {Web Server issue},
  month = jul,
  abstract = {This manuscript describes a support vector machine based method for
	the prediction of constitutive as well as immunoproteasome cleavage
	sites in antigenic sequences. {T}his method achieved {M}atthew's
	correlation coefficients of 0.54 and 0.43 on in vitro and major histocompatibility
	complex ligand data, respectively. {T}his shows that the performance
	of our method is comparable to that of the {N}et{C}hop method, which
	is currently considered to be the best method for proteasome cleavage
	site prediction. {B}ased on the method, a web server, {P}cleavage,
	has also been developed. {T}his server accepts protein sequences
	in any standard format and presents results in a user-friendly format.
	{T}he server is available for free use by all academic users at the
	{URL} http://www.imtech.res.in/raghava/pcleavage/ or http://bioinformatics.uams.edu/mirror/pcleavage/.},
  doi = {10.1093/nar/gki587},
  pdf = {../local/Bhasin2005Pcleavage.pdf},
  file = {Bhasin2005Pcleavage.pdf:local/Bhasin2005Pcleavage.pdf:PDF},
  keywords = {biosvm immunoinformatics},
  url = {http://dx.doi.org/10.1093/nar/gki587}
}
@article{Bhasin2004Analysis,
  author = {Bhasin, M. and Raghava, G. P. S.},
  title = {Analysis and prediction of affinity of {TAP} binding peptides using
	cascade {SVM}},
  journal = {Protein {S}ci.},
  year = {2004},
  volume = {13},
  pages = {596--607},
  number = {3},
  month = mar,
  abstract = {The generation of cytotoxic {T} lymphocyte ({CTL}) epitopes from an
	antigenic sequence involves number of intracellular processes, including
	production of peptide fragments by proteasome and transport of peptides
	to endoplasmic reticulum through transporter associated with antigen
	processing ({TAP}). {I}n this study, 409 peptides that bind to human
	{TAP} transporter with varying affinity were analyzed to explore
	the selectivity and specificity of {TAP} transporter. {T}he abundance
	of each amino acid from {P}1 to {P}9 positions in high-, intermediate-,
	and low-affinity {TAP} binders were examined. {T}he rules for predicting
	{TAP} binding regions in an antigenic sequence were derived from
	the above analysis. {T}he quantitative matrix was generated on the
	basis of contribution of each position and residue in binding affinity.
	{T}he correlation of r = 0.65 was obtained between experimentally
	determined and predicted binding affinity by using a quantitative
	matrix. {F}urther a support vector machine ({SVM})-based method has
	been developed to model the {TAP} binding affinity of peptides. {T}he
	correlation (r = 0.80) was obtained between the predicted and experimental
	measured values by using sequence-based {SVM}. {T}he reliability
	of prediction was further improved by cascade {SVM} that uses features
	of amino acids along with sequence. {A}n extremely good correlation
	(r = 0.88) was obtained between measured and predicted values, when
	the cascade {SVM}-based method was evaluated through jackknife testing.
	{A} {W}eb service, {TAPP}red (http://www.imtech.res.in/raghava/tappred/
	or http://bioinformatics.uams.edu/mirror/tappred/), has been developed
	based on this approach.},
  doi = {10.1110/ps.03373104},
  pdf = {../local/Bhasin2004Analysis.pdf},
  file = {Bhasin2004Analysis.pdf:local/Bhasin2004Analysis.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1110/ps.03373104}
}
@article{Bhasin2004Classification,
  author = {Bhasin, M. and Raghava, G. P. S.},
  title = {Classification of {N}uclear {R}eceptors {B}ased on {A}mino {A}cid
	{C}omposition and {D}ipeptide {C}omposition},
  journal = {J. {B}iol. {C}hem.},
  year = {2004},
  volume = {279},
  pages = {23262--23266},
  number = {22},
  abstract = {Nuclear receptors are key transcription factors that regulate crucial
	gene networks responsible for cell growth, differentiation, and homeostasis.
	{N}uclear receptors form a superfamily of phylogenetically related
	proteins and control functions associated with major diseases (e.g.
	diabetes, osteoporosis, and cancer). {I}n this study, a novel method
	has been developed for classifying the subfamilies of nuclear receptors.
	{T}he classification was achieved on the basis of amino acid and
	dipeptide composition from a sequence of receptors using support
	vector machines. {T}he training and testing was done on a non-redundant
	data set of 282 proteins obtained from the {N}uclea{RDB} data base
	(1). {T}he performance of all classifiers was evaluated using a 5-fold
	cross validation test. {I}n the 5-fold cross-validation, the data
	set was randomly partitioned into five equal sets and evaluated five
	times on each distinct set while keeping the remaining four sets
	for training. {I}t was found that different subfamilies of nuclear
	receptors were quite closely correlated in terms of amino acid composition
	as well as dipeptide composition. {T}he overall accuracy of amino
	acid composition-based and dipeptide composition-based classifiers
	were 82.6 and 97.5\%, respectively. {T}herefore, our results prove
	that different subfamilies of nuclear receptors are predictable with
	considerable accuracy using amino acid or dipeptide composition.
	{F}urthermore, based on above approach, an online web service, {NR}pred,
	was developed, which is available at www.imtech.res.in/raghava/nrpred.},
  doi = {10.1074/jbc.M401932200},
  eprint = {http://www.jbc.org/cgi/reprint/279/22/23262.pdf},
  pdf = {../local/Bhasin2004Classification.pdf},
  file = {Bhasin2004Classification.pdf:local/Bhasin2004Classification.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1074/jbc.M401932200}
}
@article{Bhasin2004ESLpred,
  author = {Bhasin, M. and Raghava, G. P. S.},
  title = {{ESLpred}: {SVM}-based method for subcellular localization of eukaryotic
	proteins using dipeptide composition and {PSI-BLAST}},
  journal = {Nucl. {A}cids {R}es.},
  year = {2004},
  volume = {32},
  pages = {W414--W419},
  number = {Suppl. 2},
  abstract = {Automated prediction of subcellular localization of proteins is an
	important step in the functional annotation of genomes. {T}he existing
	subcellular localization prediction methods are based on either amino
	acid composition or {N}-terminal characteristics of the proteins.
	{I}n this paper, support vector machine ({SVM}) has been used to
	predict the subcellular location of eukaryotic proteins from their
	different features such as amino acid composition, dipeptide composition
	and physico-chemical properties. {T}he {SVM} module based on dipeptide
	composition performed better than the {SVM} modules based on amino
	acid composition or physico-chemical properties. {I}n addition, {PSI}-{BLAST}
	was also used to search the query sequence against the dataset of
	proteins (experimentally annotated proteins) to predict its subcellular
	location. {I}n order to improve the prediction accuracy, we developed
	a hybrid module using all features of a protein, which consisted
	of an input vector of 458 dimensions (400 dipeptide compositions,
	33 properties, 20 amino acid compositions of the protein and 5 from
	{PSI}-{BLAST} output). {U}sing this hybrid approach, the prediction
	accuracies of nuclear, cytoplasmic, mitochondrial and extracellular
	proteins reached 95.3, 85.2, 68.2 and 88.9\%, respectively. {T}he
	overall prediction accuracy of {SVM} modules based on amino acid
	composition, physico-chemical properties, dipeptide composition and
	the hybrid approach was 78.1, 77.8, 82.9 and 88.0\%, respectively.
	{T}he accuracy of all the modules was evaluated using a 5-fold cross-validation
	technique. {A}ssigning a reliability index (reliability index > or
	=3), 73.5\% of prediction can be made with an accuracy of 96.4\%. {B}ased
	on the above approach, an online web server {ESL}pred was developed,
	which is available at http://www.imtech.res.in/raghava/eslpred/.},
  doi = {10.1093/nar/gkh350},
  eprint = {http://nar.oupjournals.org/cgi/reprint/32/suppl_2/W414.pdf},
  pdf = {../local/Bhasin2004ESLpred.pdf},
  file = {Bhasin2004ESLpred.pdf:local/Bhasin2004ESLpred.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://nar.oupjournals.org/cgi/content/abstract/32/suppl_2/W414}
}
@article{Bhasin2004GPCRpred,
  author = {Bhasin, M. and Raghava, G. P. S.},
  title = {{{GPCR}pred}: an {SVM}-based method for prediction of families and
	subfamilies of {G}-protein coupled receptors},
  journal = {Nucl. {A}cids {R}es.},
  year = {2004},
  volume = {32},
  pages = {W383--W389},
  number = {Suppl. 2},
  abstract = {G-protein coupled receptors ({GPCR}s) belong to one of the largest
	superfamilies of membrane proteins and are important targets for
	drug design. {I}n this study, a support vector machine ({SVM})-based
	method, {GPCR}pred, has been developed for predicting families and
	subfamilies of {GPCR}s from the dipeptide composition of proteins.
	{T}he dataset used in this study for training and testing was obtained
	from http://www.soe.ucsc.edu/research/compbio/gpcr/. {T}he method
	classified {GPCR}s and non-{GPCR}s with an accuracy of 99.5\% when
	evaluated using 5-fold cross-validation. {T}he method is further
	able to predict five major classes or families of {GPCR}s with an
	overall {M}atthew's correlation coefficient ({MCC}) and accuracy
	of 0.81 and 97.5\% respectively. {I}n recognizing the subfamilies
	of the rhodopsin-like family, the method achieved an average {MCC}
	and accuracy of 0.97 and 97.3\% respectively. {T}he method achieved
	overall accuracy of 91.3\% and 96.4\% at family and subfamily level
	respectively when evaluated on an independent/blind dataset of 650
	{GPCR}s. {A} server for recognition and classification of {GPCR}s
	based on multiclass {SVM}s has been set up at http://www.imtech.res.in/raghava/gpcrpred/.
	{W}e have also suggested subfamilies for 42 sequences which were
	previously identified as unclassified {C}lass{A} {GPCR}s. {T}he supplementary
	information is available at http://www.imtech.res.in/raghava/gpcrpred/info.html.},
  doi = {10.1093/nar/gkh416},
  eprint = {http://nar.oupjournals.org/cgi/reprint/32/suppl_2/W383.pdf},
  pdf = {../local/Bhasin2004GPCRpred.pdf},
  file = {Bhasin2004GPCRpred.pdf:local/Bhasin2004GPCRpred.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1093/nar/gkh416}
}
@article{Bhasin2004Prediction,
  author = {Bhasin, M. and Raghava, G. P. S.},
  title = {Prediction of {CTL} epitopes using {QM}, {SVM} and {ANN} techniques},
  journal = {Vaccine},
  year = {2004},
  volume = {22},
  pages = {3195--3204},
  number = {23--24},
  abstract = {Cytotoxic {T} lymphocyte ({CTL}) epitopes are potential candidates
	for subunit vaccine design for various diseases. {M}ost of the existing
	{T} cell epitope prediction methods are indirect methods that predict
	{MHC} class {I} binders instead of {CTL} epitopes. {I}n this study,
	a systematic attempt has been made to develop a direct method for
	predicting {CTL} epitopes from an antigenic sequence. {T}his method
	is based on quantitative matrix ({QM}) and machine learning techniques
	such as {S}upport {V}ector {M}achine ({SVM}) and {A}rtificial {N}eural
	{N}etwork ({ANN}). {T}his method has been trained and tested on non-redundant
	dataset of {T} cell epitopes and non-epitopes that includes 1137
	experimentally proven {MHC} class {I} restricted {T} cell epitopes.
	{T}he accuracy of {QM}-, {ANN}- and {SVM}-based methods was 70.0,
	72.2 and 75.2\%, respectively. {T}he performance of these methods
	has been evaluated through {L}eave {O}ne {O}ut {C}ross-{V}alidation
	({LOOCV}) at a cutoff score where sensitivity and specificity was
	nearly equal. {F}inally, both machine-learning methods were used
	for consensus and combined prediction of {CTL} epitopes. {T}he performances
	of these methods were evaluated on blind dataset where machine learning-based
	methods perform better than {QM}-based method. {W}e also demonstrated
	through subgroup analysis that our methods can discriminate between
	{T}-cell epitopes and {MHC} binders (non-epitopes). {I}n brief this
	method allows prediction of {CTL} epitopes using {QM}, {SVM}, {ANN}
	approaches. {T}he method also facilitates prediction of {MHC} restriction
	in predicted {T} cell epitopes.},
  doi = {10.1016/j.vaccine.2004.02.005},
  pdf = {../local/Bhasin2004Prediction.pdf},
  file = {Bhasin2004Prediction.pdf:local/Bhasin2004Prediction.pdf:PDF},
  keywords = {biosvm immunoinformatics},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1016/j.vaccine.2004.02.005}
}
@article{Bhasin2004SVM,
  author = {Bhasin, M. and Raghava, G. P. S.},
  title = {{SVM} based method for predicting {{HLA}-{DRB}1*0401} binding peptides
	in an antigen sequence},
  journal = {Bioinformatics},
  year = {2004},
  volume = {20},
  pages = {421--423},
  number = {3},
  abstract = {Summary: {P}rediction of peptides binding with {MHC} class {II} allele
	{HLA}-{DRB}1*0401 can effectively reduce the number of experiments
	required for identifying helper {T} cell epitopes. {T}his paper describes
	support vector machine ({SVM}) based method developed for identifying
	{HLA}-{DRB}1*0401 binding peptides in an antigenic sequence. {SVM}
	was trained and tested on large and clean data set consisting of
	567 binders and equal number of non-binders. {T}he accuracy of the
	method was 86\% when evaluated through 5-fold cross-validation technique.
	{A}vailable: {A} web server {HLA}-{DR}4{P}red based on above approach
	is available at http://www.imtech.res.in/raghava/hladr4pred/ and
	http://bioinformatics.uams.edu/mirror/hladr4pred/ ({M}irror {S}ite).
	{S}upplementary information: http://www.imtech.res.in/raghava/hladr4pred/info.html},
  pdf = {../local/Bhasin2004SVM.pdf},
  file = {Bhasin2004SVM.pdf:local/Bhasin2004SVM.pdf:PDF},
  keywords = {biosvm immunoinformatics},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/20/3/421}
}
@article{Bhavani2006Substructure-based,
  author = {S. Bhavani and A. Nagargadde and A. Thawani and V. Sridhar and N.
	Chandra},
  title = {Substructure-based support vector machine classifiers for prediction
	of adverse effects in diverse classes of drugs.},
  journal = {J. Chem. Inform. Model.},
  year = {2006},
  volume = {46},
  pages = {2478--2486},
  number = {6},
  abstract = {Unforeseen adverse effects exhibited by drugs contribute heavily to
	late-phase failure and even withdrawal of marketed drugs. Torsade
	de pointes (TdP) is one such important adverse effect, which causes
	cardiac arrhythmia and, in some cases, sudden death, making it crucial
	for potential drugs to be screened for torsadogenicity. The need
	to tap the power of computational approaches for the prediction of
	adverse effects such as TdP is increasingly becoming evident. The
	availability of screening data including those in organized databases
	greatly facilitates exploration of newer computational approaches.
	In this paper, we report the development of a prediction method based
	on a support vector machine algorithm. The method uses a combination
	of descriptors, encoding both the type of toxicophore as well as
	the position of the toxicophore in the drug molecule, thus considering
	both the pharmacophore and the three-dimensional shape information
	of the molecule. For delineating toxicophores, a novel pattern-recognition
	method that utilizes substructures within a molecule has been developed.
	The results obtained using the hybrid approach have been compared
	with those available in the literature for the same data set. An
	improvement in prediction accuracy is clearly seen, with the accuracy
	reaching up to 97\% in predicting compounds that can cause TdP and
	90\% for predicting compounds that do not cause TdP. The generic
	nature of the method has been demonstrated with four data sets available
	for carcinogenicity, where prediction accuracies were significantly
	higher, with a best receiver operating characteristics (ROC) value
	of 0.81 as against a best ROC value of 0.7 reported in the literature
	for the same data set. Thus, the method holds promise for wide applicability
	in toxicity prediction.},
  doi = {10.1021/ci060128l},
  keywords = {Algorithms; Carcinogens; Chemistry, Pharmaceutical; Computational
	Biology; Drug Evaluation, Preclinical; Drug Industry; Humans; Models,
	Chemical; Models, Statistical; Neural Networks (Computer); Pattern
	Recognition, Automated; ROC Curve; Sequence Analysis, Protein; Software;
	Torsades de Pointes},
  owner = {laurent},
  pmid = {17125188},
  timestamp = {2007.09.18},
  url = {http://dx.doi.org/10.1021/ci060128l}
}
@article{Blows2010Subtyping,
  author = {Fiona M Blows and Kristy E Driver and Marjanka K Schmidt and Annegien
	Broeks and Flora E van Leeuwen and Jelle Wesseling and Maggie C Cheang
	and Karen Gelmon and Torsten O Nielsen and Carl Blomqvist and P{\"a}ivi
	Heikkil{\"a} and Tuomas Heikkinen and Heli Nevanlinna and Lars A Akslen
	and Louis R B{\'e}gin and William D Foulkes and Fergus J Couch and Xianshu
	Wang and Vicky Cafourek and Janet E Olson and Laura Baglietto and
	Graham G Giles and Gianluca Severi and Catriona A McLean and Melissa
	C Southey and Emad Rakha and Andrew R Green and Ian O Ellis and Mark
	E Sherman and Jolanta Lissowska and William F Anderson and Angela
	Cox and Simon S Cross and Malcolm W R Reed and Elena Provenzano and
	Sarah-Jane Dawson and Alison M Dunning and Manjeet Humphreys and
	Douglas F Easton and Montserrat Garc{\'\i}a-Closas and Carlos Caldas and
	Paul D Pharoah and David Huntsman},
  title = {Subtyping of breast cancer by immunohistochemistry to investigate
	a relationship between subtype and short and long term survival:
	a collaborative analysis of data for 10,159 cases from 12 studies.},
  journal = {PLoS Med},
  year = {2010},
  volume = {7},
  pages = {e1000279},
  number = {5},
  month = may,
  abstract = {Immunohistochemical markers are often used to classify breast cancer
	into subtypes that are biologically distinct and behave differently.
	The aim of this study was to estimate mortality for patients with
	the major subtypes of breast cancer as classified using five immunohistochemical
	markers, to investigate patterns of mortality over time, and to test
	for heterogeneity by subtype. We pooled data from more than 10,000
	cases of invasive breast cancer from 12 studies that had collected
	information on hormone receptor status, human epidermal growth factor
	receptor-2 (HER2) status, and at least one basal marker (cytokeratin
	[CK]5/6 or epidermal growth factor receptor [EGFR]) together with
	survival time data. Tumours were classified as luminal and nonluminal
	tumours according to hormone receptor expression. These two groups
	were further subdivided according to expression of HER2, and finally,
	the luminal and nonluminal HER2-negative tumours were categorised
	according to expression of basal markers. Changes in mortality rates
	over time differed by subtype. In women with luminal HER2-negative
	subtypes, mortality rates were constant over time, whereas mortality
	rates associated with the luminal HER2-positive and nonluminal subtypes
	tended to peak within 5 y of diagnosis and then decline over time.
	In the first 5 y after diagnosis the nonluminal tumours were associated
	with a poorer prognosis, but over longer follow-up times the prognosis
	was poorer in the luminal subtypes, with the worst prognosis at 15
	y being in the luminal HER2-positive tumours. Basal marker expression
	distinguished the HER2-negative luminal and nonluminal tumours into
	different subtypes. These patterns were independent of any systemic
	adjuvant therapy. The six subtypes of breast cancer defined by expression
	of five markers show distinct behaviours with important differences
	in short term and long term prognosis. Application of these markers
	in the clinical setting could have the potential to improve the targeting
	of adjuvant chemotherapy to those most likely to benefit. The different
	patterns of mortality over time also suggest important biological
	differences between the subtypes that may result in differences in
	response to specific therapies, and that stratification of breast
	cancers by clinically relevant subtypes in clinical trials is urgently
	required.},
  doi = {10.1371/journal.pmed.1000279},
  pdf = {../local/Blows2010Subtyping.pdf},
  file = {Blows2010Subtyping.pdf:Blows2010Subtyping.pdf:PDF},
  institution = {Department of Oncology, University of Cambridge, United Kingdom.},
  keywords = {Adult; Aged; Aged, 80 and over; Breast Neoplasms, metabolism/mortality/pathology;
	Female; Hormones, analysis; Humans; Immunohistochemistry; Keratins;
	Middle Aged; Prognosis; Proportional Hazards Models; Receptor, Epidermal
	Growth Factor, analysis; Receptors, Cell Surface, metabolism; Tumor
	Markers, Biological, analysis; Young Adult},
  language = {eng},
  medline-pst = {epublish},
  owner = {phupe},
  pmid = {20520800},
  timestamp = {2011.06.01},
  url = {http://dx.doi.org/10.1371/journal.pmed.1000279}
}
@article{Bock2003Whole-proteome,
  author = {Bock, J. R. and Gough, D. A.},
  title = {Whole-proteome interaction mining},
  journal = {Bioinformatics},
  year = {2003},
  volume = {19},
  pages = {125--134},
  number = {1},
  abstract = {Motivation: {A} major post-genomic scientific and technological pursuit
	is to describe the functions performed by the proteins encoded by
	the genome. {O}ne strategy is to first identify the protein-protein
	interactions in a proteome, then determine pathways and overall structure
	relating these interactions, and finally to statistically infer functional
	roles of individual proteins. {A}lthough huge amounts of genomic
	data are at hand, current experimental protein interaction assays
	must overcome technical problems to scale-up for high-throughput
	analysis. {I}n the meantime, bioinformatics approaches may help bridge
	the information gap required for inference of protein function. {I}n
	this paper, a previously described data mining approach to prediction
	of protein-protein interactions ({B}ock and {G}ough, 2001, {B}ioinformatics,
	17, 455-460) is extended to interaction mining on a proteome-wide
	scale. {A}n algorithm (the phylogenetic bootstrap) is introduced,
	which suggests traversal of a phenogram, interleaving rounds of computation
	and experiment, to develop a knowledge base of protein interactions
	in genetically-similar organisms. {R}esults: {T}he interaction mining
	approach was demonstrated by building a learning system based on
	1,039 experimentally validated protein-protein interactions in the
	human gastric bacterium {H}elicobacter pylori. {A}n estimate of the
	generalization performance of the classifier was derived from 10-fold
	cross-validation, which indicated expected upper bounds on precision
	of 80\% and sensitivity of 69\% when applied to related organisms.
	{O}ne such organism is the enteric pathogen {C}ampylobacter jejuni,
	in which comprehensive machine learning prediction of all possible
	pairwise protein-protein interactions was performed. {T}he resulting
	network of interactions shares an average protein connectivity characteristic
	in common with previous investigations reported in the literature,
	offering strong evidence supporting the biological feasibility of
	the hypothesized map. {F}or inferences about complete proteomes in
	which the number of pairwise non-interactions is expected to be much
	larger than the number of actual interactions, we anticipate that
	the sensitivity will remain the same but precision may decrease.
	{W}e present specific biological examples of two subnetworks of protein-protein
	interactions in {C}. jejuni resulting from the application of this
	approach, including elements of a two-component signal transduction
	systems for thermoregulation, and a ferritin uptake network. {C}ontact:
	dgough@bioeng.ucsd.edu},
  pdf = {../local/Bock2003Whole-proteome.pdf},
  file = {Bock2003Whole-proteome.pdf:local/Bock2003Whole-proteome.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/1/125}
}
@article{Bock2002New,
  author = {Bock, J. R. and Gough, D. A.},
  title = {A {N}ew {M}ethod to {E}stimate {L}igand-{R}eceptor {E}nergetics},
  journal = {Mol {C}ell {P}roteomics},
  year = {2002},
  volume = {1},
  pages = {904--910},
  number = {11},
  abstract = {In the discovery of new drugs, lead identification and optimization
	have assumed critical importance given the number of drug targets
	generated from genetic, genomics, and proteomic technologies. {H}igh-throughput
	experimental screening assays have been complemented recently by
	"virtual screening" approaches to identify and filter potential ligands
	when the characteristics of a target receptor structure of interest
	are known. {V}irtual screening mandates a reliable procedure for
	automatic ranking of structurally distinct ligands in compound library
	databases. {C}omputing a rank score requires the accurate prediction
	of binding affinities between these ligands and the target. {M}any
	current scoring strategies require information about the target three-dimensional
	structure. {I}n this study, a new method to estimate the free binding
	energy between a ligand and receptor is proposed. {W}e extend a central
	idea previously reported ({B}ock, {J}. {R}., and {G}ough, {D}. {A}.
	(2001) {P}redicting protein-protein interactions from primary structure.
	{B}ioinformatics 17, 455-460; {B}ock, {J}. {R}., and {G}ough, {D}.
	{A}. (2002) {W}hole-proteome interaction mining. {B}ioinformatics,
	in press) that uses simple descriptors to represent biomolecules
	as input examples to train a support vector machine ({S}mola, {A}.
	{J}., and {S}ch{\"o}lkopf, {B}. (1998) {A} {T}utorial on {S}upport {V}ector
	{R}egression, {N}euro{COLT} {T}echnical {R}eport {NC}-{TR}-98-030,
	{R}oyal {H}olloway {C}ollege, {U}niversity of {L}ondon, {UK}) and
	the application of the trained system to previously unseen pairs,
	estimating their propensity for interaction. {H}ere we seek to learn
	the function that maps features of a receptor-ligand pair onto their
	equilibrium free binding energy. {T}hese features do not comprise
	any direct information about the three-dimensional structures of
	ligand or target. {I}n cross-validation experiments, it is demonstrated
	that objective measurements of prediction error rate and rank-ordering
	statistics are competitive with those of several other investigations,
	most of which depend on three-dimensional structural data. {T}he
	size of the sample (n = 2,671) indicates that this approach is robust
	and may have widespread applicability beyond restricted families
	of receptor types. {I}t is concluded that newly sequenced proteins,
	or those for which three-dimensional crystal structures are not easily
	obtained, can be rapidly analyzed for their binding potential against
	a library of ligands using this methodology.},
  pdf = {../local/Bock2002New.pdf},
  file = {Bock2002New.pdf:local/Bock2002New.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://www.mcponline.org/cgi/content/abstract/1/11/904}
}
@article{Bock2001Predicting,
  author   = {Bock, J. R. and Gough, D. A.},
  title    = {Predicting protein-protein interactions from primary structure},
  journal  = {Bioinformatics},
  year     = {2001},
  volume   = {17},
  number   = {5},
  pages    = {455--460},
  url      = {http://bioinformatics.oupjournals.org/cgi/reprint/17/5/455.pdf},
  keywords = {biosvm},
  subject  = {biokernel},
  pdf      = {../local/bock01.pdf},
  file     = {bock01.pdf:local/bock01.pdf:PDF}
}
@article{Boobis2002In,
  author = {A. Boobis and U. Gundert-Remy and P. Kremers and P. Macheras and
	O. Pelkonen},
  title = {{I}n silico prediction of {ADME} and pharmacokinetics. {R}eport of
	an expert meeting organised by {COST} {B}15.},
  journal = {Eur. J. Pharm. Sci.},
  year = {2002},
  volume = {17},
  pages = {183--193},
  number = {4--5},
  month = dec,
  abstract = {The computational approach is one of the newest and fastest developing
	techniques in pharmacokinetics, ADME (absorption, distribution, metabolism,
	excretion) evaluation, drug discovery and toxicity. However, to date,
	the software packages devoted to ADME prediction, especially of metabolism,
	have not yet been adequately validated and still require improvements
	to be effective. Most are 'open' systems, under constant evolution
	and able to incorporate rapidly, and often easily, new information
	from user or developer databases. Quantitative in silico predictions
	are now possible for several pharmacokinetic (PK) parameters, particularly
	absorption and distribution. The emerging consensus is that the predictions
	are no worse than those made using in vitro tests, with the decisive
	advantage that much less investment in technology, resources and
	time is needed. In addition, and of critical importance, it is possible
	to screen virtual compounds. Some packages are able to handle thousands
	of molecules in a few hours. However, common experience shows that,
	in part at least for essentially irrational reasons, there is currently
	a lack of confidence in these approaches. An effort should be made
	by the software producers towards more transparency, in order to
	improve the confidence of their consumers. It seems highly probable
	that in silico approaches will evolve rapidly, as did in vitro methods
	during the last decade. Past experience with the latter should be
	helpful in avoiding repetition of similar errors and in taking the
	necessary steps to ensure effective implementation. A general concern
	is the lack of access to the large amounts of data on compounds no
	longer in development, but still kept secret by the pharmaceutical
	industry. Controlled access to these data could be particularly helpful
	in validating new in silico approaches.},
  keywords = {Adsorption, Biological Availability, Chemical, Computer Simulation,
	Models, Pharmaceutical, Pharmaceutical Preparations, Predictive Value
	of Tests, Software, Technology},
  owner = {mahe},
  pii = {S0928098702001859},
  pmid = {12453607},
  timestamp = {2006.08.16}
}
@article{Bordner2005Statistical,
  author = {Andrew J Bordner and Ruben Abagyan},
  title = {Statistical analysis and prediction of protein-protein interfaces.},
  journal = {Proteins},
  year = {2005},
  volume = {60},
  pages = {353--366},
  number = {3},
  month = aug,
  abstract = {Predicting protein-protein interfaces from a three-dimensional structure
	is a key task of computational structural proteomics. {I}n contrast
	to geometrically distinct small molecule binding sites, protein-protein
	interfaces are notoriously difficult to predict. {W}e generated a
	large nonredundant data set of 1494 true protein-protein interfaces
	using biological symmetry annotation where necessary. {T}he data
	set was carefully analyzed and a {S}upport {V}ector {M}achine was
	trained on a combination of a new robust evolutionary conservation
	signal with the local surface properties to predict protein-protein
	interfaces. {F}ivefold cross validation verifies the high sensitivity
	and selectivity of the model. {A}s much as 97\% of the predicted
	patches had an overlap with the true interface patch while only 22\%
	of the surface residues were included in an average predicted patch.
	{T}he model allowed the identification of potential new interfaces
	and the correction of mislabeled oligomeric states.},
  doi = {10.1002/prot.20433},
  pdf = {../local/Bordner2005Statistical.pdf},
  file = {Bordner2005Statistical.pdf:local/Bordner2005Statistical.pdf:PDF},
  keywords = {biosvm},
  url = {http://dx.doi.org/10.1002/prot.20433}
}
@article{Borgwardt2005Protein,
  author = {Borgwardt, K. M. and Ong, C. S. and Sch{\"o}nauer, S. and Vishwanathan,
	S. V. N. and Smola, A. J. and Kriegel, H.-P.},
  title = {Protein function prediction via graph kernels.},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21},
  pages = {i47--i56},
  number = {Suppl. 1},
  month = jun,
  abstract = {M{OTIVATION}: {C}omputational approaches to protein function prediction
	infer protein function by finding proteins with similar sequence,
	structure, surface clefts, chemical properties, amino acid motifs,
	interaction partners or phylogenetic profiles. {W}e present a new
	approach that combines sequential, structural and chemical information
	into one graph model of proteins. {W}e predict functional class membership
	of enzymes and non-enzymes using graph kernels and support vector
	machine classification on these protein graphs. {RESULTS}: {O}ur
	graph model, derivable from protein sequence and structure only,
	is competitive with vector models that require additional protein
	information, such as the size of surface pockets. {I}f we include
	this extra information into our graph model, our classifier yields
	significantly higher accuracy levels than the vector models. {H}yperkernels
	allow us to select and to optimally combine the most relevant node
	attributes in our protein graphs. {W}e have laid the foundation for
	a protein function prediction system that integrates protein information
	from various sources efficiently and effectively. {AVAILABILITY}:
	{M}ore information available via www.dbs.ifi.lmu.de/{M}itarbeiter/borgwardt.html.
	{CONTACT}: borgwardt@dbs.ifi.lmu.de.},
  doi = {10.1093/bioinformatics/bti1007},
  pdf = {../local/Borgwardt2005Protein.pdf},
  file = {Borgwardt2005Protein.pdf:local/Borgwardt2005Protein.pdf:PDF},
  keywords = {biosvm},
  pii = {21/suppl_1/i47},
  url = {http://dx.doi.org/10.1093/bioinformatics/bti1007}
}
@article{Bosshard2001Molecular,
  author = {Bosshard, H. R.},
  title = {Molecular recognition by induced fit: how fit is the concept?},
  journal = {News Physiol Sci},
  year = {2001},
  volume = {16},
  pages = {171--173},
  month = aug,
  abstract = {Induced fit explains why biomolecules can bind together even if they
	are not optimized for binding. However, induced fit can lead to a
	kinetic bottleneck and does not describe every interaction in the
	absence of prior complementarity. Preselection of a fitting conformer
	is an alternative to induced fit.},
  keywords = {Antigen-Antibody Complex, physiology; Biological Products, chemistry/metabolism;
	Models, Biological; Molecular Conformation},
  owner = {bricehoffmann},
  pmid = {11479367},
  timestamp = {2009.02.13}
}
@article{Bowd2002Comparing,
  author = {Bowd, Christopher and Chan, Kwokleung and Zangwill, Linda M. and
	Goldbaum, Michael H. and Lee, Te-Won and Sejnowski, Terrence J. and
	Weinreb, Robert N.},
  title = {Comparing neural networks and linear discriminant functions for glaucoma
	detection using confocal scanning laser ophthalmoscopy of the optic
	disc.},
  journal = {Invest Ophthalmol Vis Sci},
  year = {2002},
  volume = {43},
  pages = {3444--3454},
  number = {11},
  month = nov,
  abstract = {P{URPOSE}: {T}o determine whether neural network techniques can improve
	differentiation between glaucomatous and nonglaucomatous eyes, using
	the optic disc topography parameters of the {H}eidelberg {R}etina
	{T}omograph ({HRT}; {H}eidelberg {E}ngineering, {H}eidelberg, {G}ermany).
	{METHODS}: {W}ith the {HRT}, one eye was imaged from each of 108
	patients with glaucoma (defined as having repeatable visual field
	defects with standard automated perimetry) and 189 subjects without
	glaucoma (no visual field defects with healthy-appearing optic disc
	and retinal nerve fiber layer on clinical examination) and the optic
	nerve topography was defined by 17 global and 66 regional {HRT} parameters.
	{W}ith all the {HRT} parameters used as input, receiver operating
	characteristic ({ROC}) curves were generated for the classification
	of eyes, by three neural network techniques: linear and {G}aussian
	support vector machines ({SVM} linear and {SVM} {G}aussian, respectively)
	and a multilayer perceptron ({MLP}), as well as four previously proposed
	linear discriminant functions ({LDF}s) and one {LDF} developed on
	the current data with all {HRT} parameters used as input. {RESULTS}:
	{T}he areas under the {ROC} curves for {SVM} linear and {SVM} {G}aussian
	were 0.938 and 0.945, respectively; for {MLP}, 0.941; for the current
	{LDF}, 0.906; and for the best previously proposed {LDF}, 0.890.
	{W}ith the use of forward selection and backward elimination optimization
	techniques, the areas under the {ROC} curves for {SVM} {G}aussian
	and the current {LDF} were increased to approximately 0.96. {CONCLUSIONS}:
	{T}rained neural networks, with global and regional {HRT} parameters
	used as input, improve on previously proposed {HRT} parameter-based
	{LDF}s for discriminating between glaucomatous and nonglaucomatous
	eyes. {T}he performance of both neural networks and {LDF}s can be
	improved with optimization of the features in the input. {N}eural
	network analyses show promise for increasing diagnostic accuracy
	of tests for glaucoma.},
  pdf = {../local/Bowd2002Comparing.pdf},
  file = {Bowd2002Comparing.pdf:local/Bowd2002Comparing.pdf:PDF},
  keywords = {Acute, Algorithms, Animals, Anion Exchange Resins, Artificial Intelligence,
	Automated, Base Pair Mismatch, Base Pairing, Base Sequence, Biological,
	Biosensing Techniques, Carcinoma, Chemical, Chromatography, Citric
	Acid Cycle, Classification, Cluster Analysis, Comparative Study,
	Computational Biology, Computer-Assisted, Cystadenoma, DNA, Databases,
	Decision Making, Diagnosis, Differential, Discriminant Analysis,
	Drug, Drug Design, Electrostatics, Eukaryotic Cells, Factual, Feasibility
	Studies, Female, Gene Expression, Gene Expression Profiling, Gene
	Expression Regulation, Genes, Genetic, Genetic Heterogeneity, Genetic
	Markers, Glaucoma, Hemolysins, Humans, Internet, Intraocular Pressure,
	Ion Exchange, Lasers, Leukemia, Ligands, Likelihood Functions, Logistic
	Models, Lung Neoplasms, Lymphocytic, Lymphoma, Markov Chains, Mathematics,
	Messenger, Models, Molecular, Molecular Probe Techniques, Molecular
	Sequence Data, Nanotechnology, Neoplasm, Neoplasms, Neoplastic, Neural
	Networks (Computer), Non-P.H.S., Non-Small-Cell Lung, Non-U.S. Gov't,
	Nucleic Acid Conformation, Nucleic Acid Hybridization, Observer Variation,
	Oligonucleotide Array Sequence Analysis, Open-Angle, Ophthalmoscopy,
	Optic Disk, Ovarian Neoplasms, P.H.S., Pattern Recognition, Probability,
	Probability Learning, Protein Binding, Protein Conformation, Proteins,
	Quality Control, Quantum Theory, RNA, RNA Splicing, ROC Curve, Receptors,
	Reference Values, Regression Analysis, Reproducibility of Results,
	Research Support, Robotics, Saccharomyces cerevisiae Proteins, Sensitivity
	and Specificity, Sequence Analysis, Signal Processing, Software,
	Statistical, Stomach Neoplasms, Structural, Structure-Activity Relationship,
	Thermodynamics, Transcription, Tumor Markers, U.S. Gov't, 12407155},
  pmid = {12407155},
  url = {http://www.iovs.org/cgi/content/abstract/43/11/3444}
}
@article{Bowd2004Confocal,
  author = {Bowd, Christopher and Zangwill, Linda M. and Medeiros, Felipe A. and
	Hao, Jiucang and Chan, Kwokleung and Lee, Te-Won and Sejnowski, Terrence J.
	and Goldbaum, Michael H. and Sample, Pamela A. and Crowston, Jonathan G.
	and Weinreb, Robert N.},
  title = {Confocal scanning laser ophthalmoscopy classifiers and stereophotograph
	evaluation for prediction of visual field abnormalities in glaucoma-suspect
	eyes.},
  journal = {Invest Ophthalmol Vis Sci},
  year = {2004},
  volume = {45},
  pages = {2255--2262},
  number = {7},
  month = jul,
  abstract = {P{URPOSE}: {T}o determine whether {H}eidelberg {R}etina {T}omograph
	({HRT}; {H}eidelberg {E}ngineering, {D}ossenheim, {G}ermany) classification
	techniques and investigational support vector machine ({SVM}) analyses
	can detect optic disc abnormalities in glaucoma-suspect eyes before
	the development of visual field abnormalities. {METHODS}: {G}laucoma-suspect
	eyes (n = 226) were classified as converts or nonconverts based on
	the development of repeatable (either two or three consecutive) standard
	automated perimetry ({SAP})-detected abnormalities over the course
	of the study (mean follow-up, approximately 4.5 years). {H}azard
	ratios for development of {SAP} abnormalities were calculated based
	on baseline classification results, follow-up time, and end point
	status (convert, nonconvert). {C}lassification techniques applied
	were {HRT} classification ({HRTC}), {M}oorfields {R}egression {A}nalysis,
	forward-selection optimized {SVM} ({SVM} fwd) and backward elimination-optimized
	{SVM} ({SVM} back) analysis of {HRT} data, and stereophotograph assessment.
	{RESULTS}: {U}nivariate analyses indicated that all classification
	techniques were predictors of the development of two repeatable abnormal
	{SAP} results, with hazards ratios (95\% confidence interval [{CI}])
	ranging from 1.32 (1.00-1.75) for {HRTC} to 2.0 (1.48-2.76) for stereophotograph
	assessment (all {P} < or = 0.05). {O}nly {SVM} ({SVM} fwd and {SVM}
	back) analysis of {HRT} data and stereophotograph assessment were
	univariate predictors of the development of three repeatable abnormal
	{SAP} results, with hazard ratios (95\% {CI}) ranging from 1.73 (1.16-2.82)
	for {SVM} fwd to 1.82 (1.19-3.12) for {SVM} back (both {P} < 0.007).
	{M}ultivariate analyses including each classification technique individually
	in a model with age, baseline {SAP} pattern standard deviation [{PSD}],
	and baseline {IOP} indicated that all classification techniques except
	{HRTC} ({P} = 0.06) were predictors of the development of two repeatable
	abnormal {SAP} results with hazards ratios ranging from 1.30 (0.99,
	1.73) for {HRTC} to 1.90 (1.37, 2.69) for stereophotograph assessment.
	{O}nly {SVM} ({SVM} fwd and {SVM} back) analysis of {HRT} data and
	stereophotograph assessment were significant predictors of the development
	of three repeatable abnormal {SAP} results in multivariate analyses;
	hazard ratios of 1.57 (1.03, 2.59) and 1.70 (1.18, 2.51), respectively.
	{SAP} {PSD} was a significant predictor of two repeatable abnormal
	{SAP} results in multivariate models with all classification techniques,
	with hazard ratios ranging from 3.31 (1.39, 7.89) to 4.70 (2.02,
	10.93) per 1-d{B} increase. {CONCLUSIONS}: {HRT} classifications
	techniques and stereophotograph assessment can detect optic disc
	topography abnormalities in glaucoma-suspect eyes before the development
	of {SAP} abnormalities. {T}hese data support strongly the importance
	of optic disc examination for early glaucoma diagnosis.},
  doi = {10.1167/iovs.03-1087},
  pdf = {../local/Bowd2004Confocal.pdf},
  file = {Bowd2004Confocal.pdf:local/Bowd2004Confocal.pdf:PDF},
  keywords = {80 and over, Adolescent, Adult, Aged, Algorithms, Artificial Intelligence,
	Auditory, Benchmarking, Binding Sites, Brain Stem, Breast Diseases,
	Chemical, Child, Chromosomes, Comparative Study, Computational Biology,
	Computer Simulation, Computer-Assisted, Data Interpretation, Databases,
	Diagnosis, Diagnostic Errors, Differential, Drug Resistance, Electroencephalography,
	Epilepsy, Evoked Potentials, Female, Forecasting, Gene Expression,
	Gene Expression Profiling, Genetic, Genotype, Glaucoma, Greece, HIV
	Protease Inhibitors, HIV-1, Human, Humans, Infant, Information Management,
	Information Storage and Retrieval, Intraocular Pressure, Kinetics,
	Language Development Disorders, Lasers, Least-Squares Analysis, Linear
	Models, Male, Microbial Sensitivity Tests, Middle Aged, Models, Molecular,
	Monitoring, Nephroblastoma, Non-U.S. Gov't, Nonlinear Dynamics, Ocular
	Hypertension, Oligonucleotide Array Sequence Analysis, Ophthalmoscopy,
	Optic Disk, Optic Nerve Diseases, P.H.S., Pair 1, Perimetry, Periodicals,
	Phosphorylation, Phosphotransferases, Photography, Physiologic, Point
	Mutation, Preschool, Prognosis, Protein, Proteins, Pyrimidinones,
	Reaction Time, Recurrence, Reproducibility of Results, Research Support,
	Reverse Transcriptase Inhibitors, Sensitivity and Specificity, Sequence
	Alignment, Sequence Analysis, Signal Processing, Software, Sound
	Localization, Statistical, Stochastic Processes, Structure-Activity
	Relationship, Theoretical, Time Factors, U.S. Gov't, Viral, Vision
	Disorders, Visual Fields, 15223803},
  pmid = {15223803},
  url = {http://dx.doi.org/10.1167/iovs.03-1087}
}
@article{Bradford2005Improved,
  author = {Bradford, James R. and Westhead, David R.},
  title = {Improved prediction of protein-protein binding sites using a support
	vector machines approach.},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21},
  pages = {1487--1494},
  number = {8},
  month = apr,
  abstract = {M{OTIVATION}: {S}tructural genomics projects are beginning to produce
	protein structures with unknown function, therefore, accurate, automated
	predictors of protein function are required if all these structures
	are to be properly annotated in reasonable time. {I}dentifying the
	interface between two interacting proteins provides important clues
	to the function of a protein and can reduce the search space required
	by docking algorithms to predict the structures of complexes. {RESULTS}:
	{W}e have combined a support vector machine ({SVM}) approach with
	surface patch analysis to predict protein-protein binding sites.
	{U}sing a leave-one-out cross-validation procedure, we were able
	to successfully predict the location of the binding site on 76\%
	of our dataset made up of proteins with both transient and obligate
	interfaces. {W}ith heterogeneous cross-validation, where we trained
	the {SVM} on transient complexes to predict on obligate complexes
	(and vice versa), we still achieved comparable success rates to the
	leave-one-out cross-validation suggesting that sufficient properties
	are shared between transient and obligate interfaces. {AVAILABILITY}:
	{A} web application based on the method can be found at http://www.bioinformatics.leeds.ac.uk/ppi_pred.
	{T}he dataset of 180 proteins used in this study is also available
	via the same web site. {CONTACT}: westhead@bmb.leeds.ac.uk {SUPPLEMENTARY}
	{INFORMATION}: http://www.bioinformatics.leeds.ac.uk/ppi-pred/supp-material.},
  doi = {10.1093/bioinformatics/bti242},
  pdf = {../local/Bradford2005Improved.pdf},
  file = {Bradford2005Improved.pdf:local/Bradford2005Improved.pdf:PDF},
  keywords = {biosvm},
  pii = {bti242},
  url = {http://dx.doi.org/10.1093/bioinformatics/bti242}
}
@article{Brazma2001Minimum,
  author = {Brazma, A. and Hingamp, P. and Quackenbush, J. and Sherlock, G. and
	Spellman, P. and Stoeckert, C. and Aach, J. and Ansorge, W. and Ball,
	C. A. and Causton, H. C. and Gaasterland, T. and Glenisson, P. and
	Holstege, F. C. and Kim, I. F. and Markowitz, V. and Matese, J. C.
	and Parkinson, H. and Robinson, A. and Sarkans, U. and Schulze-Kremer,
	S. and Stewart, J. and Taylor, R. and Vilo, J. and Vingron, M.},
  title = {Minimum information about a microarray experiment ({MIAME})---toward
	standards for microarray data.},
  journal = {Nat. Genet.},
  year = {2001},
  volume = {29},
  pages = {365--371},
  number = {4},
  month = dec,
  abstract = {Microarray analysis has become a widely used tool for the generation
	of gene expression data on a genomic scale. Although many significant
	results have been derived from microarray studies, one limitation
	has been the lack of standards for presenting and exchanging such
	data. Here we present a proposal, the Minimum Information About a
	Microarray Experiment (MIAME), that describes the minimum information
	required to ensure that microarray data can be easily interpreted
	and that results derived from its analysis can be independently verified.
	The ultimate goal of this work is to establish a standard for recording
	and reporting microarray-based gene expression data, which will in
	turn facilitate the establishment of databases and public repositories
	and enable the development of data analysis tools. With respect to
	MIAME, we concentrate on defining the content and structure of the
	necessary information rather than the technical format for capturing
	it.},
  doi = {10.1038/ng1201-365},
  institution = {European Bioinformatics Institute, EMBL outstation, Wellcome Trust
	Genome Campus, Hinxton, Cambridge CB10 1SD, UK. brazma@ebi.ac.uk},
  keywords = {Computational Biology; Gene Expression Profiling, methods; Oligonucleotide
	Array Sequence Analysis, standards},
  language = {eng},
  medline-pst = {ppublish},
  owner = {phupe},
  pii = {ng1201-365},
  pmid = {11726920},
  timestamp = {2011.04.08},
  url = {http://dx.doi.org/10.1038/ng1201-365}
}
@article{Briem2005Classifying,
  author = {Briem, Hans and G{\"u}nther, Judith},
  title = {Classifying ``kinase inhibitor-likeness'' by using machine-learning
	methods.},
  journal = {ChemBioChem},
  year = {2005},
  volume = {6},
  pages = {558--566},
  number = {3},
  month = mar,
  abstract = {By using an in-house data set of small-molecule structures, encoded
	by {G}hose-{C}rippen parameters, several machine learning techniques
	were applied to distinguish between kinase inhibitors and other molecules
	with no reported activity on any protein kinase. {A}ll four approaches
	pursued--support-vector machines ({SVM}), artificial neural networks
	({ANN}), k nearest neighbor classification with {GA}-optimized feature
	selection ({GA}/k{NN}), and recursive partitioning ({RP})--proved
	capable of providing a reasonable discrimination. {N}evertheless,
	substantial differences in performance among the methods were observed.
	{F}or all techniques tested, the use of a consensus vote of the 13
	different models derived improved the quality of the predictions
	in terms of accuracy, precision, recall, and {F}1 value. {S}upport-vector
	machines, followed by the {GA}/k{NN} combination, outperformed the
	other techniques when comparing the average of individual models.
	{B}y using the respective majority votes, the prediction of neural
	networks yielded the highest {F}1 value, followed by {SVM}s.},
  doi = {10.1002/cbic.200400109},
  pdf = {../local/Briem2005Classifying.pdf},
  file = {Briem2005Classifying.pdf:local/Briem2005Classifying.pdf:PDF},
  keywords = {biosvm chemoinformatics},
  url = {http://dx.doi.org/10.1002/cbic.200400109}
}
@article{Briggs2002Gene,
  author = {Briggs, Scott D. and Xiao, Tiaojiang and Sun, Zu-Wen and Caldwell,
	Jennifer A. and Shabanowitz, Jeffrey and Hunt, Donald F. and Allis,
	C. David and Strahl, Brian D.},
  title = {Gene silencing: trans-histone regulatory pathway in chromatin.},
  journal = {Nature},
  year = {2002},
  volume = {418},
  pages = {498},
  number = {6897},
  month = aug,
  abstract = {The fundamental unit of eukaryotic chromatin, the nucleosome, consists
	of genomic DNA wrapped around the conserved histone proteins H3,
	H2B, H2A and H4, all of which are variously modified at their amino-
	and carboxy-terminal tails to influence the dynamics of chromatin
	structure and function -- for example, conjugation of histone H2B
	with ubiquitin controls the outcome of methylation at a specific
	lysine residue (Lys 4) on histone H3, which regulates gene silencing
	in the yeast Saccharomyces cerevisiae. Here we show that ubiquitination
	of H2B is also necessary for the methylation of Lys 79 in H3, the
	only modification known to occur away from the histone tails, but
	that not all methylated lysines in H3 are regulated by this 'trans-histone'
	pathway because the methylation of Lys 36 in H3 is unaffected. Given
	that gene silencing is regulated by the methylation of Lys 4 and
	Lys 79 in histone H3, we suggest that H2B ubiquitination acts as
	a master switch that controls the site-selective histone methylation
	patterns responsible for this silencing.},
  doi = {10.1038/nature00970},
  institution = {Department of Biochemistry and Molecular Genetics, University of
	Virginia Health System, Charlottesville, Virginia 22908, USA.},
  keywords = {Chromatin, chemistry/metabolism; Gene Expression Regulation, Fungal;
	Gene Silencing; Histone-Lysine N-Methyltransferase; Histones, chemistry/metabolism;
	Ligases, metabolism; Methylation; Models, Biological; Nuclear Proteins,
	metabolism; Saccharomyces cerevisiae Proteins; Saccharomyces cerevisiae,
	genetics/metabolism; Ubiquitin, metabolism; Ubiquitin-Conjugating
	Enzymes},
  language = {eng},
  medline-pst = {ppublish},
  owner = {jp},
  pii = {nature00970},
  pmid = {12152067},
  timestamp = {2010.11.23},
  url = {http://dx.doi.org/10.1038/nature00970}
}
@article{Brown2000Knowledge-based,
  author = {Brown, M. P. and Grundy, W. N. and Lin, D. and Cristianini, N. and
	Sugnet, C. W. and Furey, T. S. and Ares, M. and Haussler, D.},
  title = {Knowledge-based analysis of microarray gene expression data by using
	support vector machines.},
  journal = {Proc. Natl. Acad. Sci. USA},
  year = {2000},
  volume = {97},
  pages = {262--267},
  number = {1},
  month = jan,
  abstract = {We introduce a method of functionally classifying genes by using gene
	expression data from {DNA} microarray hybridization experiments.
	{T}he method is based on the theory of support vector machines ({SVM}s).
	{SVM}s are considered a supervised computer learning method because
	they exploit prior knowledge of gene function to identify unknown
	genes of similar function from expression data. {SVM}s avoid several
	problems associated with unsupervised clustering methods, such as
	hierarchical clustering and self-organizing maps. {SVM}s have many
	mathematical features that make them attractive for gene expression
	analysis, including their flexibility in choosing a similarity function,
	sparseness of solution when dealing with large data sets, the ability
	to handle large feature spaces, and the ability to identify outliers.
	{W}e test several {SVM}s that use different similarity metrics, as
	well as some other supervised learning methods, and find that the
	{SVM}s best identify sets of genes with a common function using expression
	data. {F}inally, we use {SVM}s to predict functional roles for uncharacterized
	yeast {ORF}s based on their expression data.},
  pdf = {../local/Brown2000Knowledge-based.pdf},
  file = {Brown2000Knowledge-based.pdf:local/Brown2000Knowledge-based.pdf:PDF},
  keywords = {biosvm microarray},
  url = {http://www.pnas.org/cgi/content/abstract/97/1/262}
}
@article{Brusic2002Prediction,
  author = {Brusic, V. and Petrovsky, N. and Zhang, G. and Bajic, V. B.},
  title = {Prediction of promiscuous peptides that bind {HLA} class {I} molecules.},
  journal = {Immunol. Cell Biol.},
  year = {2002},
  volume = {80},
  pages = {280--285},
  number = {3},
  month = jun,
  abstract = {Promiscuous T-cell epitopes make ideal targets for vaccine development.
	We report here a computational system, MULTIPRED, for the prediction
	of peptide binding to the HLA-A2 supertype. It combines a novel representation
	of peptide/MHC interactions with a hidden Markov model as the prediction
	algorithm. MULTIPRED is both sensitive and specific, and demonstrates
	high accuracy of peptide-binding predictions for HLA-A*0201, *0204,
	and *0205 alleles, good accuracy for *0206 allele, and marginal accuracy
	for *0203 allele. MULTIPRED replaces earlier requirements for individual
	prediction models for each HLA allelic variant and simplifies computational
	aspects of peptide-binding prediction. Preliminary testing indicates
	that MULTIPRED can predict peptide binding to HLA-A2 supertype molecules
	with high accuracy, including those allelic variants for which no
	experimental binding data are currently available.},
  keywords = {Algorithms, Amino Acid Motifs, Amino Acid Sequence, Antigen-Antibody
	Complex, Automated, Binding Sites, Computational Biology, Drug Delivery
	Systems, Drug Design, Epitopes, Forecasting, Genes, HLA Antigens,
	HLA-A Antigens, HLA-A2 Antigen, HLA-DR Antigens, Humans, Internet,
	MHC Class I, Markov Chains, Molecular Sequence Data, Neural Networks
	(Computer), Pattern Recognition, Peptide Fragments, Peptides, Protein,
	Protein Binding, Protein Interaction Mapping, Sensitivity and Specificity,
	Sequence Analysis, Software, T-Lymphocyte, User-Computer Interface,
	Viral Vaccines, 12067415},
  pii = {1088},
  pmid = {12067415},
  timestamp = {2007.01.25}
}
@article{Bui2006Structural,
  author = {Bui, H.-H. and Schiewe, A. J. and von Grafenstein, H. and Haworth,
	I. S.},
  title = {Structural prediction of peptides binding to {MHC} class {I} molecules.},
  journal = {Proteins},
  year = {2006},
  volume = {63},
  pages = {43--52},
  number = {1},
  month = apr,
  abstract = {Peptide binding to class I major histocompatibility complex (MHCI)
	molecules is a key step in the immune response and the structural
	details of this interaction are of importance in the design of peptide
	vaccines. Algorithms based on primary sequence have had success in
	predicting potential antigenic peptides for MHCI, but such algorithms
	have limited accuracy and provide no structural information. Here,
	we present an algorithm, PePSSI (peptide-MHC prediction of structure
	through solvated interfaces), for the prediction of peptide structure
	when bound to the MHCI molecule, HLA-A2. The algorithm combines sampling
	of peptide backbone conformations and flexible movement of MHC side
	chains and is unique among other prediction algorithms in its incorporation
	of explicit water molecules at the peptide-MHC interface. In an initial
	test of the algorithm, PePSSI was used to predict the conformation
	of eight peptides bound to HLA-A2, for which X-ray data are available.
	Comparison of the predicted and X-ray conformations of these peptides
	gave RMSD values between 1.301 and 2.475 {\AA}. Binding conformations
	of 266 peptides with known binding affinities for HLA-A2 were then
	predicted using PePSSI. Structural analyses of these peptide-HLA-A2
	conformations showed that peptide binding affinity is positively
	correlated with the number of peptide-MHC contacts and negatively
	correlated with the number of interfacial water molecules. These
	results are consistent with the relatively hydrophobic binding nature
	of the HLA-A2 peptide binding interface. In summary, PePSSI is capable
	of rapid and accurate prediction of peptide-MHC binding conformations,
	which may in turn allow estimation of MHCI-peptide binding affinity.},
  doi = {10.1002/prot.20870},
  keywords = {Algorithms, Amino Acid Sequence, Antigens, Artificial Intelligence,
	Automated, Binding Sites, Chemical, Computational Biology, Computer
	Simulation, Crystallography, Electrostatics, Genes, Genetic,
	HLA Antigens, Histocompatibility Antigens Class I, Humans, Hydrogen
	Bonding, Ligands, MHC Class I, Major Histocompatibility Complex,
	Models, Molecular, Molecular Conformation, Molecular Sequence Data,
	Pattern Recognition, Peptides, Protein, Protein Binding, Protein
	Conformation, Proteomics, Quantitative Structure-Activity Relationship,
	Sequence Alignment, Sequence Analysis, Software, Structural Homology,
	Structure-Activity Relationship, Thermodynamics, Water, X-Ray, X-Rays,
	16447245},
  pmid = {16447245},
  timestamp = {2007.01.25},
  url = {http://dx.doi.org/10.1002/prot.20870}
}
@article{Bui2005Automated,
  author = {Bui, Huynh-Hoa and Sidney, John and Peters, Bjoern and Sathiamurthy,
	Muthuraman and Sinichi, Asabe and Purton, Kelly-Anne and Moth{\'e},
	Bianca R. and Chisari, Francis V. and Watkins, David I. and Sette,
	Alessandro},
  title = {Automated generation and evaluation of specific {MHC} binding predictive
	tools: {ARB} matrix applications.},
  journal = {Immunogenetics},
  year = {2005},
  volume = {57},
  pages = {304--314},
  number = {5},
  month = jun,
  abstract = {Prediction of which peptides can bind major histocompatibility complex
	(MHC) molecules is commonly used to assist in the identification
	of T cell epitopes. However, because of the large numbers of different
	MHC molecules of interest, each associated with different predictive
	tools, tool generation and evaluation can be a very resource intensive
	task. A methodology commonly used to predict MHC binding affinity
	is the matrix or linear coefficients method. Herein, we described
	Average Relative Binding (ARB) matrix methods that directly predict
	IC(50) values allowing combination of searches involving different
	peptide sizes and alleles into a single global prediction. A computer
	program was developed to automate the generation and evaluation of
	ARB predictive tools. Using an in-house MHC binding database, we
	generated a total of 85 and 13 MHC class I and class II matrices,
	respectively. Results from the automated evaluation of tool efficiency
	are presented. We anticipate that this automation framework will
	be generally applicable to the generation and evaluation of large
	numbers of MHC predictive methods and tools, and will be of value
	to centralize and rationalize the process of evaluation of MHC predictions.
	MHC binding predictions based on ARB matrices were made available
	at http://epitope.liai.org:8080/matrix web server.},
  doi = {10.1007/s00251-005-0798-y},
  keywords = {Animals; Binding Sites; Computer Simulation; Databases, Protein; Epitopes;
	Histocompatibility Antigens; Humans; Major Histocompatibility Complex;
	Models, Biological; Protein Binding},
  owner = {laurent},
  pmid = {15868141},
  timestamp = {2007.07.12},
  url = {http://dx.doi.org/10.1007/s00251-005-0798-y}
}
@article{Bullard2010Evaluation,
  author = {Bullard, J. H. and Purdom, E. and Hansen, K. D. and Dudoit, S.},
  title = {Evaluation of statistical methods for normalization and differential
	expression in {mRNA-Seq} experiments.},
  journal = {BMC Bioinformatics},
  year = {2010},
  volume = {11},
  pages = {94},
  abstract = {High-throughput sequencing technologies, such as the Illumina Genome
	Analyzer, are powerful new tools for investigating a wide range of
	biological and medical questions. Statistical and computational methods
	are key for drawing meaningful and accurate conclusions from the
	massive and complex datasets generated by the sequencers. We provide
	a detailed evaluation of statistical methods for normalization and
	differential expression (DE) analysis of Illumina transcriptome sequencing
	(mRNA-Seq) data. We compare statistical methods for detecting genes
	that are significantly DE between two types of biological samples
	and find that there are substantial differences in how the test statistics
	handle low-count genes. We evaluate how DE results are affected by
	features of the sequencing platform, such as, varying gene lengths,
	base-calling calibration method (with and without phi X control lane),
	and flow-cell/library preparation effects. We investigate the impact
	of the read count normalization method on DE results and show that
	the standard approach of scaling by total lane counts (e.g., RPKM)
	can bias estimates of DE. We propose more general quantile-based
	normalization procedures and demonstrate an improvement in DE detection. Our
	results have significant practical and methodological implications
	for the design and analysis of mRNA-Seq experiments. They highlight
	the importance of appropriate statistical methods for normalization
	and DE inference, to account for features of the sequencing platform
	that could impact the accuracy of results. They also reveal the need
	for further research in the development of statistical and computational
	methods for mRNA-Seq.},
  doi = {10.1186/1471-2105-11-94},
  institution = {Division of Biostatistics, University of California, Berkeley, Berkeley,
	CA, USA. bullard@berkeley.edu},
  keywords = {Computational Biology; Databases, Genetic; RNA, Messenger; Sequence
	Analysis, RNA},
  owner = {laurent},
  pii = {1471-2105-11-94},
  pmid = {20167110},
  timestamp = {2012.04.11},
  url = {http://dx.doi.org/10.1186/1471-2105-11-94}
}
@article{Bunescu2005Comparative,
  author = {Bunescu, R. and Ge, R. and Kate, R. J. and Marcotte, E. M. and Mooney,
	R. J. and Ramani, A. K. and Wong, Y. W.},
  title = {Comparative experiments on learning information extractors for proteins
	and their interactions.},
  journal = {Artif. {I}ntell. {M}ed.},
  year = {2005},
  volume = {33},
  pages = {139--155},
  number = {2},
  month = feb,
  abstract = {O{BJECTIVE}: {A}utomatically extracting information from biomedical
	text holds the promise of easily consolidating large amounts of biological
	knowledge in computer-accessible form. {T}his strategy is particularly
	attractive for extracting data relevant to genes of the human genome
	from the 11 million abstracts in {M}edline. {H}owever, extraction
	efforts have been frustrated by the lack of conventions for describing
	human genes and proteins. {W}e have developed and evaluated a variety
	of learned information extraction systems for identifying human protein
	names in {M}edline abstracts and subsequently extracting information
	on interactions between the proteins. {METHODS} {AND} {MATERIAL}:
	{W}e used a variety of machine learning methods to automatically
	develop information extraction systems for extracting information
	on gene/protein name, function and interactions from {M}edline abstracts.
	{W}e present cross-validated results on identifying human proteins
	and their interactions by training and testing on a set of approximately
	1000 manually-annotated {M}edline abstracts that discuss human genes/proteins.
	{RESULTS}: {W}e demonstrate that machine learning approaches using
	support vector machines and maximum entropy are able to identify
	human proteins with higher accuracy than several previous approaches.
	{W}e also demonstrate that various rule induction methods are able
	to identify protein interactions with higher precision than manually-developed
	rules. {CONCLUSION}: {O}ur results show that it is promising to use
	machine learning to automatically build systems for extracting information
	from biomedical text. {T}he results also give a broad picture of
	the relative strengths of a wide variety of methods when tested on
	a reasonably large human-annotated corpus.},
  doi = {10.1016/j.artmed.2004.07.016},
  pdf = {../local/Bunescu2005Comparative.pdf},
  file = {Bunescu2005Comparative.pdf:local/Bunescu2005Comparative.pdf:PDF},
  keywords = {biosvm},
  pii = {S0933-3657(04)00131-9},
  url = {http://dx.doi.org/10.1016/j.artmed.2004.07.016}
}
@article{Burbidge2001Drug,
  author = {Burbidge, R. and Trotter, M. and Buxton, B. and Holden, S.},
  title = {Drug design by machine learning: support vector machines for pharmaceutical
	data analysis},
  journal = {Comput. {C}hem.},
  year = {2001},
  volume = {26},
  pages = {4--15},
  number = {1},
  month = dec,
  pdf = {../local/burb01.pdf},
  file = {burb01.pdf:local/burb01.pdf:PDF},
  keywords = {biosvm chemoinformatics},
  subject = {qsar},
  url = {http://stats.ma.ic.ac.uk/~rdb/pubs/candc-aisb00-rbmt-final.pdf}
}
@article{Burckin2005Exploring,
  author = {Burckin, T. and Nagel, R. and Mandel-Gutfreund, Y. and Shiue, L.
	and Clark, T. A. and Chong, J.-L. and Chang, T.-H. and Squazzo, S.
	and Hartzog, G. and Ares, M.},
  title = {Exploring functional relationships between components of the gene
	expression machinery.},
  journal = {Nat. {S}truct. {M}ol. {B}iol.},
  year = {2005},
  volume = {12},
  pages = {175--182},
  number = {2},
  month = feb,
  abstract = {Eukaryotic gene expression requires the coordinated activity of many
	macromolecular machines including transcription factors and {RNA}
	polymerase, the spliceosome, m{RNA} export factors, the nuclear pore,
	the ribosome and decay machineries. {Y}east carrying mutations in
	genes encoding components of these machineries were examined using
	microarrays to measure changes in both pre-m{RNA} and m{RNA} levels.
	{W}e used these measurements as a quantitative phenotype to ask how
	steps in the gene expression pathway are functionally connected.
	{A} multiclass support vector machine was trained to recognize the
	gene expression phenotypes caused by these mutations. {I}n several
	cases, unexpected phenotype assignments by the computer revealed
	functional roles for specific factors at multiple steps in the gene
	expression pathway. {T}he ability to resolve gene expression pathway
	phenotypes provides insight into how the major machineries of gene
	expression communicate with each other.},
  doi = {10.1038/nsmb891},
  pdf = {../local/Burckin2005Exploring.pdf},
  file = {Burckin2005Exploring.pdf:local/Burckin2005Exploring.pdf:PDF},
  keywords = {biosvm microarray},
  pii = {nsmb891},
  url = {http://dx.doi.org/10.1038/nsmb891}
}
@article{Busuttil2004Support,
  author = {Busuttil, S. and Abela, J. and Pace, G. J.},
  title = {Support vector machines with profile-based kernels for remote protein
	homology detection.},
  journal = {Genome {I}nform {S}er {W}orkshop {G}enome {I}nform},
  year = {2004},
  volume = {15},
  pages = {191--200},
  number = {2},
  abstract = {Two new techniques for remote protein homology detection particularly
	suited for sparse data are introduced. {T}hese methods are based
	on position specific scoring matrices or profiles and use a support
	vector machine ({SVM}) for discrimination. {T}he performance on standard
	benchmarks outperforms previous non-discriminative techniques and
	is comparable to that of other {SVM}-based methods while giving distinct
	advantages.},
  pdf = {../local/Busuttil2004Support.pdf},
  file = {Busuttil2004Support.pdf:local/Busuttil2004Support.pdf:PDF},
  keywords = {biosvm},
  url = {http://www.jsbi.org/journal/GIW04/GIW04F020.html}
}
@article{Byvatov2003Comparison,
  author = {Byvatov, E. and Fechner, U. and Sadowski, J. and Schneider, G.},
  title = {Comparison of support vector machine and artificial neural network
	systems for drug/nondrug classification.},
  journal = {J {C}hem {I}nf {C}omput {S}ci},
  year = {2003},
  volume = {43},
  pages = {1882--1889},
  number = {6},
  abstract = {Support vector machine ({SVM}) and artificial neural network ({ANN})
	systems were applied to a drug/nondrug classification problem as
	an example of binary decision problems in early-phase virtual compound
	filtering and screening. {T}he results indicate that solutions obtained
	by {SVM} training seem to be more robust with a smaller standard
	error compared to {ANN} training. {G}enerally, the {SVM} classifier
	yielded slightly higher prediction accuracy than {ANN}, irrespective
	of the type of descriptors used for molecule encoding, the size of
	the training data sets, and the algorithm employed for neural network
	training. {T}he performance was compared using various different
	descriptor sets and descriptor combinations based on the 120 standard
	{G}hose-{C}rippen fragment descriptors, a wide range of 180 different
	properties and physicochemical descriptors from the {M}olecular {O}perating
	{E}nvironment ({MOE}) package, and 225 topological pharmacophore
	({CATS}) descriptors. {F}or the complete set of 525 descriptors cross-validated
	classification by {SVM} yielded 82\% correct predictions ({M}atthews
	cc = 0.63), whereas {ANN} reached 80\% correct predictions ({M}atthews
	cc = 0.58). {A}lthough {SVM} outperformed the {ANN} classifiers with
	regard to overall prediction accuracy, both methods were shown to
	complement each other, as the sets of true positives, false positives
	(overprediction), true negatives, and false negatives (underprediction)
	produced by the two classifiers were not identical. {T}he theory
	of {SVM} and {ANN} training is briefly reviewed.},
  doi = {10.1021/ci0341161},
  pdf = {../local/Byvatov2003Comparison.pdf},
  file = {Byvatov2003Comparison.pdf:local/Byvatov2003Comparison.pdf:PDF},
  keywords = {biosvm chemoinformatics},
  url = {http://dx.doi.org/10.1021/ci0341161}
}
@article{Byvatov2004SVM-based,
  author = {Byvatov, Evgeny and Schneider, Gisbert},
  title = {{SVM}-based feature selection for characterization of focused compound
	collections.},
  journal = {J {C}hem {I}nf {C}omput {S}ci},
  year = {2004},
  volume = {44},
  pages = {993--999},
  number = {3},
  abstract = {Artificial neural networks, the support vector machine ({SVM}), and
	other machine learning methods for the classification of molecules
	are often considered as a "black box", since the molecular features
	that are most relevant for a given classifier are usually not presented
	in a human-interpretable form. {W}e report on an {SVM}-based algorithm
	for the selection of relevant molecular features from a trained classifier
	that might be important for an understanding of ligand-receptor interactions.
	{T}he original {SVM} approach was extended to allow for feature selection.
	{T}he method was applied to characterize focused libraries of enzyme
	inhibitors. {A} comparison with classical {K}olmogorov-{S}mirnov
	({KS})-based feature selection was performed. {I}n most of the applications
	the {SVM} method showed sustained classification accuracy, thereby
	relying on a smaller number of molecular features than {KS}-based
	classifiers. {I}n one case both methods produced comparable results.
	{L}imiting the calculation of descriptors to only the most relevant
	ones for a certain biological activity can also be used to speed
	up high-throughput virtual screening.},
  doi = {10.1021/ci0342876},
  pdf = {../local/Byvatov2004SVM-based.pdf},
  file = {Byvatov2004SVM-based.pdf:local/Byvatov2004SVM-based.pdf:PDF},
  keywords = {biosvm chemoinformatics featureselection},
  url = {http://dx.doi.org/10.1021/ci0342876}
}
@article{Byvatov2003Support,
  author = {Byvatov, E. and Schneider, G.},
  title = {Support vector machine applications in bioinformatics.},
  journal = {Appl {B}ioinformatics},
  year = {2003},
  volume = {2},
  pages = {67--77},
  number = {2},
  abstract = {The support vector machine ({SVM}) approach represents a data-driven
	method for solving classification tasks. {I}t has been shown to produce
	lower prediction error compared to classifiers based on other methods
	like artificial neural networks, especially when large numbers of
	features are considered for sample description. {I}n this review,
	the theory and main principles of the {SVM} approach are outlined,
	and successful applications in traditional areas of bioinformatics
	research are described. {C}urrent developments in techniques related
	to the {SVM} approach are reviewed which might become relevant for
	future functional genomics and chemogenomics projects. {I}n a comparative
	study, we developed neural network and {SVM} models to identify small
	organic molecules that potentially modulate the function of {G}-protein
	coupled receptors. {T}he {SVM} system was able to correctly classify
	approximately 90\% of the compounds in a cross-validation study yielding
	a {M}atthews correlation coefficient of 0.78. {T}his classifier can
	be used for fast filtering of compound libraries in virtual screening
	applications.},
  keywords = {biosvm}
}
@article{Cai2004Enzyme,
  author = {Cai, C. Z. and Han, L. Y. and Ji, Z. L. and Chen, Y. Z.},
  title = {Enzyme family classification by support vector machines.},
  journal = {Proteins},
  year = {2004},
  volume = {55},
  pages = {66--76},
  number = {1},
  abstract = {One approach for facilitating protein function prediction is to classify
	proteins into functional families. {R}ecent studies on the classification
	of {G}-protein coupled receptors and other proteins suggest that
	a statistical learning method, {S}upport vector machines ({SVM}),
	may be potentially useful for protein classification into functional
	families. {I}n this work, {SVM} is applied and tested on the classification
	of enzymes into functional families defined by the {E}nzyme {N}omenclature
	{C}ommittee of {IUBMB}. {SVM} classification system for each family
	is trained from representative enzymes of that family and seed proteins
	of {P}fam curated protein families. {T}he classification accuracy
	for enzymes from 46 families and for non-enzymes is in the range
	of 50.0\% to 95.7\% and 79.0\% to 100\% respectively. {T}he corresponding
	{M}atthews correlation coefficient is in the range of 54.1\% to 96.1\%.
	{M}oreover, 80.3\% of the 8,291 correctly classified enzymes are uniquely
	classified into a specific enzyme family by using a scoring function,
	indicating that {SVM} may have certain level of unique prediction
	capability. {T}esting results also suggest that {SVM} in some cases
	is capable of classification of distantly related enzymes and homologous
	enzymes of different functions. {E}ffort is being made to use a more
	comprehensive set of enzymes as training sets and to incorporate
	multi-class {SVM} classification systems to further enhance the unique
	prediction accuracy. {O}ur results suggest the potential of {SVM}
	for enzyme family classification and for facilitating protein function
	prediction. {O}ur software is accessible at http://jing.cz3.nus.edu.sg/cgi-bin/svmprot.cgi.},
  doi = {10.1002/prot.20045},
  pdf = {../local/Cai2004Enzyme.pdf},
  file = {Cai2004Enzyme.pdf:local/Cai2004Enzyme.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1002/prot.20045}
}
@article{Cai2003Protein,
  author = {Cai, C. Z. and Wang, W. L. and Sun, L. Z. and Chen, Y. Z.},
  title = {Protein function classification via support vector machine approach.},
  journal = {Math. {B}iosci.},
  year = {2003},
  volume = {185},
  pages = {111--122},
  number = {2},
  abstract = {Support vector machine ({SVM}) is introduced as a method for the classification
	of proteins into functionally distinguished classes. {S}tudies are
	conducted on a number of protein classes including {RNA}-binding
	proteins; protein homodimers, proteins responsible for drug absorption,
	proteins involved in drug distribution and excretion, and drug metabolizing
	enzymes. {T}esting accuracy for the classification of these protein
	classes is found to be in the range of 84-96\%. {T}his suggests the
	usefulness of {SVM} in the classification of protein functional classes
	and its potential application in protein function prediction.},
  doi = {10.1016/S0025-5564(03)00096-8},
  pdf = {../local/Cai2003Protein.pdf},
  file = {Cai2003Protein.pdf:local/Cai2003Protein.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert}
}
@article{Cai2003SVM-Prot,
  author = {Cai, C. Z. and Han, L. Y. and Ji, Z. L. and Chen, X. and Chen, Y. Z.},
  title = {{SVM-Prot}: {Web}-based support vector machine software for functional
	classification of a protein from its primary sequence.},
  journal = {Nucleic {A}cids {R}es},
  year = {2003},
  volume = {31},
  pages = {3692--3697},
  number = {13},
  month = jul,
  abstract = {Prediction of protein function is of significance in studying biological
	processes. {O}ne approach for function prediction is to classify
	a protein into functional family. {S}upport vector machine ({SVM})
	is a useful method for such classification, which may involve proteins
	with diverse sequence distribution. {W}e have developed a web-based
	software, {SVMP}rot, for {SVM} classification of a protein into functional
	family from its primary sequence. {SVMP}rot classification system
	is trained from representative proteins of a number of functional
	families and seed proteins of {P}fam curated protein families. {I}t
	currently covers 54 functional families and additional families will
	be added in the near future. {T}he computed accuracy for protein
	family classification is found to be in the range of 69.1-99.6\%.
	{SVMP}rot shows a certain degree of capability for the classification
	of distantly related proteins and homologous proteins of different
	function and thus may be used as a protein function prediction tool
	that complements sequence alignment methods. {SVMP}rot can be accessed
	at http://jing.cz3.nus.edu.sg/cgi-bin/svmprot.cgi.},
  pdf = {../local/Cai2003SVM-Prot.pdf},
  file = {Cai2003SVM-Prot.pdf:local/Cai2003SVM-Prot.pdf:PDF},
  keywords = {biosvm},
  url = {http://nar.oxfordjournals.org/cgi/content/abstract/31/13/3692}
}
@article{Cai2003Supportc,
  author = {Cai, Y. D. and Feng, K. Y. and Li, Y. X. and Chou, K. C.},
  title = {Support vector machine for predicting alpha-turn types.},
  journal = {Peptides},
  year = {2003},
  volume = {24},
  pages = {629--630},
  number = {4},
  abstract = {Tight turns play an important role in globular proteins from both
	the structural and functional points of view. {O}f tight turns, beta-turns
	and gamma-turns have been extensively studied, but alpha-turns were
	little investigated. {R}ecently, a systematic search for alpha-turns
	classified alpha-turns into nine different types according to their
	backbone trajectory features. {I}n this paper, {S}upport {V}ector
	{M}achines ({SVM}s), a new machine learning method, is proposed for
	predicting the alpha-turn types in proteins. {T}he high rates of
	correct prediction imply that the formation of different alpha-turn
	types is evidently correlated with the sequence of a pentapeptide,
	and hence can be approximately predicted based on the sequence information
	of the pentapeptide alone, although the incorporation of its interaction
	with the other part of a protein, the so-called "long distance interaction",
	will further improve the prediction quality.},
  doi = {10.1016/S0196-9781(03)00100-1},
  pdf = {../local/Cai2003Supportc.pdf},
  file = {Cai2003Supportc.pdf:local/Cai2003Supportc.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1016/S0196-9781(03)00100-1}
}
@article{Cai2003Supportd,
  author = {Cai, Y. D. and Lin, S. L.},
  title = {Support vector machines for predicting {rRNA}-, {RNA}-, and {DNA}-binding
	proteins from amino acid sequence.},
  journal = {Biochim. {B}iophys. {A}cta},
  year = {2003},
  volume = {1648},
  pages = {127--133},
  number = {1-2},
  abstract = {Classification of gene function remains one of the most important
	and demanding tasks in the post-genome era. {M}ost of the current
	predictive computer methods rely on comparing features that are essentially
	linear to the protein sequence. {H}owever, features of a protein
	nonlinear to the sequence may also be predictive to its function.
	{M}achine learning methods, for instance the {S}upport {V}ector {M}achines
	({SVM}s), are particularly suitable for exploiting such features.
	{I}n this work we introduce {SVM} and the pseudo-amino acid composition,
	a collection of nonlinear features extractable from protein sequence,
	to the field of protein function prediction. {W}e have developed
	prototype {SVM}s for binary classification of r{RNA}-, {RNA}-, and
	{DNA}-binding proteins. {U}sing a protein's amino acid composition
	and limited range correlation of hydrophobicity and solvent accessible
	surface area as input, each of the {SVM}s predicts whether the protein
	belongs to one of the three classes. {I}n self-consistency and cross-validation
	tests, which measures the success of learning and prediction, respectively,
	the r{RNA}-binding {SVM} has consistently achieved >95\% accuracy.
	{T}he {RNA}- and {DNA}-binding {SVM}s demonstrate more diverse accuracy,
	ranging from approximately 76\% to approximately 97\%. {A}nalysis of
	the test results suggests the directions of improving the {SVM}s.},
  doi = {10.1016/S1570-9639(03)00112-2},
  pdf = {../local/Cai2003Supportd.pdf},
  file = {Cai2003Supportd.pdf:local/Cai2003Supportd.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1016/S1570-9639(03)00112-2}
}
@article{Cai2003Supporta,
  author = {Cai, Y. D. and Lin, S. L. and Chou, K. C.},
  title = {Support vector machines for prediction of protein signal sequences
	and their cleavage sites},
  journal = {Peptides},
  year = {2003},
  volume = {24},
  pages = {159--161},
  number = {1},
  abstract = {Given a nascent protein sequence, how can one predict its signal peptide
	or "{Z}ipcode" sequence? {T}his is an important problem for scientists
	to use signal peptides as a vehicle to find new drugs or to reprogram
	cells for gene therapy (see, e.g. [7] {K}.{C}. {C}hou, {C}urrent
	{P}rotein and {P}eptide {S}cience 2002;3:615--22). {I}n this paper,
	support vector machines ({SVM}s), a new machine learning method,
	is applied to approach this problem. {T}he overall rate of correct
	prediction for 1939 secretary proteins and 1440 nonsecretary proteins
	was over 91\%. {I}t has not escaped our attention that the new method
	may also serve as a useful tool for further investigating many unclear
	details regarding the molecular mechanism of the {ZIP} code protein-sorting
	system in cells.},
  doi = {10.1016/S0196-9781(02)00289-9},
  pdf = {../local/Cai2003Supporta.pdf},
  file = {Cai2003Supporta.pdf:local/Cai2003Supporta.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert}
}
@article{Cai2003Prediction,
  author = {Cai, Y. D. and Liu, X. J. and Li, Y. X. and Xu, X. B. and Chou, K. C.},
  title = {Prediction of beta-turns with learning machines.},
  journal = {Peptides},
  year = {2003},
  volume = {24},
  pages = {665--669},
  number = {5},
  abstract = {The support vector machine approach was introduced to predict the
	beta-turns in proteins. {T}he overall self-consistency rate by the
	re-substitution test for the training or learning dataset reached
	100\%. {B}oth the training dataset and independent testing dataset
	were taken from {C}hou [{J}. {P}ept. {R}es. 49 (1997) 120]. {T}he
	success prediction rates by the jackknife test for the beta-turn
	subset of 455 tetrapeptides and non-beta-turn subset of 3807 tetrapeptides
	in the training dataset were 58.1 and 98.4\%, respectively. {T}he
	success rates with the independent dataset test for the beta-turn
	subset of 110 tetrapeptides and non-beta-turn subset of 30,231 tetrapeptides
	were 69.1 and 97.3\%, respectively. {T}he results obtained from this
	study support the conclusion that the residue-coupled effect along
	a tetrapeptide is important for the formation of a beta-turn.},
  doi = {10.1016/S0196-9781(03)00133-5},
  pdf = {../local/Cai2003Prediction.pdf},
  file = {Cai2003Prediction.pdf:local/Cai2003Prediction.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1016/S0196-9781(03)00133-5}
}
@article{Cai2003Supportb,
  author = {Cai, Y. D. and Liu, X. J. and Xu, X. B. and Chou, K. C.},
  title = {Support vector machines for prediction of protein domain structural
	class.},
  journal = {J. {T}heor. {B}iol.},
  year = {2003},
  volume = {221},
  pages = {115--120},
  number = {1},
  abstract = {The support vector machines ({SVM}s) method was introduced for predicting
	the structural class of protein domains. {T}he results obtained through
	the self-consistency test, jack-knife test, and independent dataset
	test have indicated that the current method and the elegant component-coupled
	algorithm developed by {C}hou and co-workers, if effectively complemented
	with each other, may become a powerful tool for predicting the structural
	class of protein domains.},
  doi = {10.1006/jtbi.2003.3179},
  pdf = {../local/Cai2003Supportb.pdf},
  file = {Cai2003Supportb.pdf:local/Cai2003Supportb.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1006/jtbi.2003.3179}
}
@article{Cai2002Supporta,
  author = {Cai, Y. D. and Liu, X. J. and Xu, X. B. and Chou, K. C.},
  title = {Support {Vector} {Machines} for predicting {HIV} protease cleavage
	sites in protein.},
  journal = {J. {C}omput. {C}hem.},
  year = {2002},
  volume = {23},
  pages = {267--274},
  number = {2},
  abstract = {Knowledge of the polyprotein cleavage sites by {HIV} protease will
	refine our understanding of its specificity, and the information
	thus acquired is useful for designing specific and efficient {HIV}
	protease inhibitors. {T}he pace in searching for the proper inhibitors
	of {HIV} protease will be greatly expedited if one can find an accurate,
	robust, and rapid method for predicting the cleavage sites in proteins
	by {HIV} protease. {I}n this article, a {S}upport {V}ector {M}achine
	is applied to predict the cleavability of oligopeptides by proteases
	with multiple and extended specificity subsites. {W}e selected {HIV}-1
	protease as the subject of the study. {T}wo hundred ninety-nine oligopeptides
	were chosen for the training set, while the other 63 oligopeptides
	were taken as a test set. {B}ecause of its high rate of self-consistency
	(299/299 = 100\%), a good result in the jackknife test (286/299 95\%)
	and correct prediction rate (55/63 = 87\%), it is expected that the
	{S}upport {V}ector {M}achine method can be referred to as a useful
	assistant technique for finding effective inhibitors of {HIV} protease,
	which is one of the targets in designing potential drugs against
	{AIDS}. {T}he principle of the {S}upport {V}ector {M}achine method
	can also be applied to analyzing the specificity of other multisubsite
	enzymes.},
  doi = {10.1002/jcc.10017},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1002/jcc.10017}
}
@article{Cai2002Supportb,
  author = {Cai, Y. D. and Liu, X. J. and Xu, X. B. and Chou, K. C.},
  title = {Support vector machines for predicting the specificity of {GalNAc}-transferase},
  journal = {Peptides},
  year = {2002},
  volume = {23},
  pages = {205--208},
  abstract = {Support {V}ector {M}achines ({SVM}s) which is one kind of learning
	machines, was applied to predict the specificity of {G}al{NA}c-transferase.
	{T}he examination for the self-consistency and the jackknife test
	of the {SVM}s method were tested for the training dataset (305 oligopeptides),
	the correct rate of self-consistency and jackknife test reaches 100\%
	and 84.9\%, respectively. {F}urthermore, the prediction of the independent
	testing dataset (30 oligopeptides) was tested, the rate reaches 76.67\%.},
  doi = {10.1016/S0196-9781(01)00597-6},
  pdf = {../local/Cai2002Supportb.pdf},
  file = {Cai2002Supportb.pdf:local/Cai2002Supportb.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1016/S0196-9781(01)00597-6}
}
@article{Cai2002Supportc,
  author = {Cai, Y. D. and Liu, X. J. and Xu, X. B. and Chou, K. C.},
  title = {Support vector machines for the classification and prediction of
	beta-turn types},
  journal = {J. {P}ept. {S}ci.},
  year = {2002},
  volume = {8},
  pages = {297--301},
  number = {7},
  abstract = {The support vector machines ({SVM}s) method is proposed because it
	can reflect the sequence-coupling effect for a tetrapeptide in not
	only a beta-turn or non-beta-turn, but also in different types of
	beta-turn. {T}he results of the model for 6022 tetrapeptides indicate
	that the rates of self-consistency for beta-turn types {I}, {I}',
	{II}, {II}', {VI} and {VIII} and non-beta-turns are 99.92\%, 96.8\%,
	98.02\%, 97.75\%, 100\%, 97.19\% and 100\%, respectively. {U}sing these
	training data, the rate of correct prediction by the {SVM}s for a
	given protein: rubredoxin (54 residues. 51 tetrapeptides) which includes
	12 beta-turn type {I} tetrapeptides, 1 beta-turn type {II} tetrapeptide
	and 38 non-beta-turns reached 82.4\%. {T}he high quality of prediction
	of the {SVM}s implies that the formation of different beta-turn types
	or non-beta-turns is considerably correlated with the sequence of
	a tetrapeptide. {T}he {SVM}s can save {CPU} time and avoid the overfitting
	problem compared with the neural network method.},
  doi = {10.1002/psc.401},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1002/psc.401}
}
@article{Cai2000Support,
  author = {Cai, Y. D. and Liu, X. J. and Xu, X. B. and Chou, K. C.},
  title = {Support vector machines for prediction of protein subcellular location},
  journal = {Mol. {C}ell {B}iol. {R}es. {C}ommun.},
  year = {2000},
  volume = {4},
  pages = {230--234},
  number = {4},
  abstract = {Support {V}ector {M}achine ({SVM}), which is one kind of learning
	machines, was applied to predict the subcellular location of proteins
	from their amino acid composition. {I}n this research, the proteins
	are classified into the following 12 groups: (1) chloroplast, (2)
	cytoplasm, (3) cytoskeleton, (4) endoplasmic reticulum, (5) extracall,
	(6) {G}olgi apparatus, (7) lysosome, (8) mitochondria, (9) nucleus,
	(10) peroxisome, (11) plasma membrane, and (12) vacuole, which have
	covered almost all the organelles and subcellular compartments in
	an animal or plant cell. {T}he examination for the self-consistency
	and the jackknife test of the {SVM}s method was tested for the three
	sets: 2022 proteins, 2161 proteins, and 2319 proteins. {A}s a result,
	the correct rate of self-consistency and jackknife test reaches 91
	and 82\% for 2022 proteins, 89 and 75\% for 2161 proteins, and 85 and
	73\% for 2319 proteins, respectively. {F}urthermore, the predicting
	rate was tested by the three independent testing datasets containing
	2240 proteins, 2513 proteins, and 2591 proteins. {T}he correct prediction
	rates reach 82, 75, and 73\% for 2240 proteins, 2513 proteins, and
	2591 proteins, respectively.},
  doi = {10.1006/mcbr.2001.0285},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1006/mcbr.2001.0285}
}
@article{Cai2004Application,
  author = {Cai, Y. D. and Ricardo, P. W. and Jen, C. H. and Chou, K. C.},
  title = {Application of {SVM} to predict membrane protein types.},
  journal = {J. {T}heor. {B}iol.},
  year = {2004},
  volume = {226},
  pages = {373--376},
  number = {4},
  abstract = {As a continuous effort to develop automated methods for predicting
	membrane protein types that was initiated by {C}hou and {E}lrod ({PROTEINS}:
	{S}tructure, {F}unction, and {G}enetics, 1999, 34, 137-153), the
	support vector machine ({SVM}) is introduced. {R}esults obtained
	through re-substitution, jackknife, and independent data set tests,
	respectively, have indicated that the {SVM} approach is quite a promising
	one, suggesting that the covariant discriminant algorithm ({C}hou
	and {E}lrod, {P}rotein {E}ng. 12 (1999) 107) and {SVM}, if effectively
	complemented with each other, will become a powerful tool for predicting
	membrane protein types and the other protein attributes as well.},
  doi = {10.1016/j.jtbi.2003.08.015},
  pdf = {../local/Cai2004Application.pdf},
  file = {Cai2004Application.pdf:local/Cai2004Application.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1016/j.jtbi.2003.08.015}
}
@article{Cai2004Identify,
  author = {Cai, Y. D. and Zhou, G. P. and Jen, C. H. and Lin, S. L. and Chou, K. C.},
  title = {Identify catalytic triads of serine hydrolases by support vector
	machines.},
  journal = {J. {T}heor. {B}iol.},
  year = {2004},
  volume = {228},
  pages = {551--557},
  number = {4},
  abstract = {The core of an enzyme molecule is its active site from the viewpoints
	of both academic research and industrial application. {T}o reveal
	the structural and functional mechanism of an enzyme, one needs to
	know its active site; to conduct structure-based drug design by regulating
	the function of an enzyme, one needs to know the active site and
	its microenvironment as well. {G}iven the atomic coordinates of an
	enzyme molecule, how can we predict its active site? {T}o tackle
	such a problem, a distance group approach was proposed and the support
	vector machine algorithm applied to predict the catalytic triad of
	serine hydrolase family. {T}he success rate by jackknife test for
	the 139 serine hydrolases was 85\%, implying that the method is quite
	promising and may become a useful tool in structural bioinformatics.},
  doi = {10.1016/j.jtbi.2004.02.019},
  pdf = {../local/Cai2004Identify.pdf},
  file = {Cai2004Identify.pdf:local/Cai2004Identify.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1016/j.jtbi.2004.02.019}
}
@article{Cai2004Prediction,
  author = {Yu-Dong Cai and Andrew J Doig},
  title = {Prediction of {S}accharomyces cerevisiae protein functional class
	from functional domain composition.},
  journal = {Bioinformatics},
  year = {2004},
  volume = {20},
  pages = {1292--300},
  number = {8},
  month = may,
  abstract = {M{OTIVATION}: {A} key goal of genomics is to assign function to genes,
	especially for orphan sequences. {RESULTS}: {W}e compared the clustered
	functional domains in the {SBASE} database to each protein sequence
	using {BLASTP}. {T}his representation for a protein is a vector,
	where each of the non-zero entries in the vector indicates a significant
	match between the sequence of interest and the {SBASE} domain. {T}he
	machine learning methods nearest neighbour algorithm ({NNA}) and
	support vector machines are used for predicting protein functional
	classes from this information. {W}e find that the best results are
	found using the {SBASE}-{A} database and the {NNA}, namely 72\% accuracy
	for 79\% coverage. {W}e tested an assigning function based on searching
	for {I}nter{P}ro sequence motifs and by taking the most significant
	{BLAST} match within the dataset. {W}e applied the functional domain
	composition method to predict the functional class of 2018 currently
	unclassified yeast open reading frames. {AVAILABILITY}: {A} program
	for the prediction method, that uses {NNA} called {F}unctional {C}lass
	{P}rediction based on {F}unctional {D}omains ({FCPFD}) is available
	and can be obtained by contacting {Y}.{D}.{C}ai at y.cai@umist.ac.uk},
  doi = {10.1093/bioinformatics/bth085},
  pdf = {../local/Cai2004Prediction.pdf},
  file = {Cai2004Prediction.pdf:local/Cai2004Prediction.pdf:PDF},
  keywords = {biosvm},
  pii = {bth085},
  url = {http://dx.doi.org/10.1093/bioinformatics/bth085}
}
@article{Cai2002Support,
  author = {Cai, Y.-D. and Liu, X.-J. and Xu, X.-B. and Chou, K.-C.},
  title = {Support vector machines for prediction of protein subcellular location
	by incorporating quasi-sequence-order effect},
  journal = {J. {C}ell. {B}iochem.},
  year = {2002},
  volume = {84},
  pages = {343--348},
  number = {2},
  abstract = {Support {V}ector {M}achine ({SVM}), which is one class of learning
	machines, was applied to predict the subcellular location of proteins
	by incorporating the quasi-sequence-order effect ({C}hou [2000] {B}iochem.
	{B}iophys. {R}es. {C}ommun. 278:477-483). {I}n this study, the proteins
	are classified into the following 12 groups: (1) chloroplast, (2)
	cytoplasm, (3) cytoskeleton, (4) endoplasmic reticulum, (5) extracellular,
	(6) {G}olgi apparatus, (7) lysosome, (8) mitochondria, (9) nucleus,
	(10) peroxisome, (11) plasma membrane, and (12) vacuole, which account
	for most organelles and subcellular compartments in an animal or
	plant cell. {E}xaminations for self-consistency and jackknife testing
	of the {SVM}s method were conducted for three sets consisting of
	1,911, 2,044, and 2,191 proteins. {T}he correct rates for self-consistency
	and the jackknife test values achieved with these protein sets were
	94 and 83\% for 1,911 proteins, 92 and 78\% for 2,044 proteins, and
	89 and 75\% for 2,191 proteins, respectively. {F}urthermore, tests
	for correct prediction rates were undertaken with three independent
	testing datasets containing 2,148 proteins, 2,417 proteins, and 2,494
	proteins producing values of 84, 77, and 74\%, respectively.},
  doi = {10.1002/jcb.10030},
  pdf = {../local/Cai2002Support.pdf},
  file = {Cai2002Support.pdf:local/Cai2002Support.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1002/jcb.10030}
}
@article{Cai2002Prediction,
  author = {Cai, Y.-D. and Liu, X.-J. and Xu, X.-B. and Zhou, G.-P.},
  title = {Prediction of protein structural classes by support vector machines.},
  journal = {Comput. {C}hem.},
  year = {2002},
  volume = {26},
  pages = {293--296},
  number = {3},
  abstract = {In this paper, we apply a new machine learning method which is called
	support vector machine to approach the prediction of protein structural
	class. {T}he support vector machine method is performed based on
	the database derived from {SCOP} which is based upon domains of known
	structure and the evolutionary relationships and the principles that
	govern their 3{D} structure. {A}s a result, high rates of both self-consistency
	and jackknife test are obtained. {T}his indicates that the structural
	class of a protein is considerably correlated with its amino acid composition,
	and the support vector machine can be referred as a powerful computational
	tool for predicting the structural classes of proteins.},
  doi = {10.1016/S0097-8485(01)00113-9},
  pdf = {../local/Cai2002Prediction.pdf},
  file = {Cai2002Prediction.pdf:local/Cai2002Prediction.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1016/S0097-8485(01)00113-9}
}
@article{Cai2001Support,
  author = {Cai, Y.-D. and Liu, X.-J. and Xu, X.-B. and Zhou, G.-P.},
  title = {Support {V}ector {M}achines for predicting protein structural class},
  journal = {B{MC} {B}ioinformatics},
  year = {2001},
  volume = {2},
  pages = {3},
  number = {3},
  abstract = {Background {W}e apply a new machine learning method, the so-called
	{S}upport {V}ector {M}achine method, to predict the protein structural
	class. {S}upport {V}ector {M}achine method is performed based on
	the database derived from {SCOP}, in which protein domains are classified
	based on known structures and the evolutionary relationships and
	the principles that govern their 3-{D} structure. {R}esults {H}igh
	rates of both self-consistency and jackknife tests are obtained.
	{T}he good results indicate that the structural class of a protein
	is considerably correlated with its amino acid composition. {C}onclusions
	{I}t is expected that the {S}upport {V}ector {M}achine method and
	the elegant component-coupled method, also named as the covariant
	discrimination algorithm, if complemented with each other, can provide
	a powerful computational tool for predicting the structural classes
	of proteins.},
  doi = {10.1186/1471-2105-2-3},
  pdf = {../local/Cai2001Support.pdf},
  file = {Cai2001Support.pdf:local/Cai2001Support.pdf:PDF},
  keywords = {biosvm},
  owner = {vert},
  url = {http://www.biomedcentral.com/1471-2105/2/3/abstract}
}
@article{Cai2003Support,
  author = {Cai, Y.-D. and Zhou, G.-P. and Chou, K.-C.},
  title = {Support {V}ector {M}achines for {P}redicting {M}embrane {P}rotein
	{T}ypes by {U}sing {F}unctional {D}omain {C}omposition},
  journal = {Biophys. {J}.},
  year = {2003},
  volume = {84},
  pages = {3257--3263},
  number = {5},
  abstract = {Membrane proteins are generally classified into the following five
	types: 1), type {I} membrane protein; 2), type {II} membrane protein;
	3), multipass transmembrane proteins; 4), lipid chain-anchored membrane
	proteins; and 5), {GPI}-anchored membrane proteins. {I}n this article,
	based on the concept of using the functional domain composition to
	define a protein, the {S}upport {V}ector {M}achine algorithm is developed
	for predicting the membrane protein type. {H}igh success rates are
	obtained by both the self-consistency and jackknife tests. {T}he
	current approach, complemented with the powerful covariant discriminant
	algorithm based on the pseudo-amino acid composition that has incorporated
	quasi-sequence-order effect as recently proposed by {K}. {C}. {C}hou
	(2001), may become a very useful high-throughput tool in the area
	of bioinformatics and proteomics.},
  pdf = {../local/Cai2003Support.pdf},
  file = {Cai2003Support.pdf:local/Cai2003Support.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://www.biophysj.org/cgi/content/abstract/84/5/3257}
}
@article{Camps-Valls2004Profiled,
  author = {Camps-Valls, G. and Chalk, A. M. and Serrano-Lopez, A. J. and Martin-Guerrero,
	J. D. and Sonnhammer, E. L.},
  title = {Profiled support vector machines for antisense oligonucleotide efficacy
	prediction.},
  journal = {B{MC} {B}ioinformatics},
  year = {2004},
  volume = {5},
  pages = {135},
  number = {135},
  abstract = {Background {T}his paper presents the use of {S}upport {V}ector {M}achines
	({SVM}s) for prediction and analysis of antisense oligonucleotide
	({AO}) efficacy. {T}he collected database comprises 315 {AO} molecules
	including 68 features each, inducing a problem well-suited to {SVM}s.
	{T}he task of feature selection is crucial given the presence of
	noisy or redundant features, and the well-known problem of the curse
	of dimensionality. {W}e propose a two-stage strategy to develop an
	optimal model: (1) feature selection using correlation analysis,
	mutual information, and {SVM}-based recursive feature elimination
	({SVM}-{RFE}), and (2) {AO} prediction using standard and profiled
	{SVM} formulations. {A} profiled {SVM} gives different weights to
	different parts of the training data to focus the training on the
	most important regions. {R}esults {I}n the first stage, the {SVM}-{RFE}
	technique was most efficient and robust in the presence of low number
	of samples and high input space dimension. {T}his method yielded
	an optimal subset of 14 representative features, which were all related
	to energy and sequence motifs. {T}he second stage evaluated the performance
	of the predictors (overall correlation coefficient between observed
	and predicted efficacy, r; mean error, {ME}; and root-mean-square-error,
	{RMSE}) using 8-fold and minus-one-{RNA} cross-validation methods.
	{T}he profiled {SVM} produced the best results (r = 0.44, {ME} =
	0.022, and {RMSE}= 0.278) and predicted high (>75\% inhibition of
	gene expression) and low efficacy (<25\%) {AO}s with a success rate
	of 83.3\% and 82.9\%, respectively, which is better than by previous
	approaches. {A} web server for {AO} prediction is available online
	at http://aosvm.cgb.ki.se/. {C}onclusions {T}he {SVM} approach is
	well suited to the {AO} prediction problem, and yields a prediction
	accuracy superior to previous methods. {T}he profiled {SVM} was found
	to perform better than the standard {SVM}, suggesting that it could
	lead to improvements in other prediction problems as well.},
  doi = {10.1186/1471-2105-5-135},
  pdf = {../local/Camps-Valls2004Profiled.pdf},
  file = {Camps-Valls2004Profiled.pdf:local/Camps-Valls2004Profiled.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://www.biomedcentral.com/1471-2105/5/135}
}
@article{Capriotti2005I-Mutant,
  author = {Capriotti, E. and Fariselli, P. and Casadio, R.},
  title = {I-{M}utant2.0: predicting stability changes upon mutation from the
	protein sequence or structure.},
  journal = {Nucleic {A}cids {R}es.},
  year = {2005},
  volume = {33},
  pages = {W306--10},
  number = {Web Server issue},
  month = jul,
  abstract = {I-{M}utant2.0 is a support vector machine ({SVM})-based tool for the
	automatic prediction of protein stability changes upon single point
	mutations. {I}-{M}utant2.0 predictions are performed starting either
	from the protein structure or, more importantly, from the protein
	sequence. {T}his latter task, to the best of our knowledge, is exploited
	for the first time. {T}he method was trained and tested on a data
	set derived from {P}ro{T}herm, which is presently the most comprehensive
	available database of thermodynamic experimental data of free energy
	changes of protein stability upon mutation under different conditions.
	{I}-{M}utant2.0 can be used both as a classifier for predicting the
	sign of the protein stability change upon mutation and as a regression
	estimator for predicting the related {D}elta{D}elta{G} values. {A}cting
	as a classifier, {I}-{M}utant2.0 correctly predicts (with a cross-validation
	procedure) 80\% or 77\% of the data set, depending on the usage of
	structural or sequence information, respectively. {W}hen predicting
	{D}elta{D}elta{G} values associated with mutations, the correlation
	of predicted with expected/experimental values is 0.71 (with a standard
	error of 1.30 kcal/mol) and 0.62 (with a standard error of 1.45 kcal/mol)
	when structural or sequence information are respectively adopted.
	{O}ur web interface allows the selection of a predictive mode that
	depends on the availability of the protein structure and/or sequence.
	{I}n this latter case, the web server requires only pasting of a
	protein sequence in a raw format. {W}e therefore introduce {I}-{M}utant2.0
	as a unique and valuable helper for protein design, even when the
	protein structure is not yet known with atomic resolution. {A}vailability:
	http://gpcr.biocomp.unibo.it/cgi/predictors/{I}-{M}utant2.0/{I}-{M}utant2.0.cgi.},
  doi = {10.1093/nar/gki375},
  pdf = {../local/local},
  file = {local:local/:PDF},
  keywords = {biosvm},
  pii = {33/suppl_2/W306},
  url = {http://dx.doi.org/10.1093/nar/gki375}
}
@article{Carter2001computational,
  author = {Carter, R. J. and Dubchak, I. and Holbrook, S. R.},
  title = {A computational approach to identify genes for functional {{RNA}s}
	in genomic sequences},
  journal = {Nucl. {A}cids {R}es.},
  year = {2001},
  volume = {29},
  pages = {3928--3938},
  number = {19},
  abstract = {Currently there is no successful computational approach for identification
	of genes encoding novel functional {RNA}s (f{RNA}s) in genomic sequences.
	{W}e have developed a machine learning approach using neural networks
	and support vector machines to extract common features among known
	{RNA}s for prediction of new {RNA} genes in the unannotated regions
	of prokaryotic and archaeal genomes. {T}he {E}scherichia coli genome
	was used for development, but we have applied this method to several
	other bacterial and archaeal genomes. {N}etworks based on nucleotide
	composition were 80-90\% accurate in jackknife testing experiments
	for bacteria and 90-99\% for hyperthermophilic archaea. {W}e also
	achieved a significant improvement in accuracy by combining these
	predictions with those obtained using a second set of parameters
	consisting of known {RNA} sequence motifs and the calculated free
	energy of folding. {S}everal known f{RNA}s not included in the training
	datasets were identified as well as several hundred predicted novel
	{RNA}s. {T}hese studies indicate that there are many unidentified
	{RNA}s in simple genomes that can be predicted computationally as
	a precursor to experimental study. {P}ublic access to our {RNA} gene
	predictions and an interface for user predictions is available via
	the web.},
  pdf = {../local/Carter2001computational.pdf},
  file = {Carter2001computational.pdf:local/Carter2001computational.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://nar.oupjournals.org/cgi/content/abstract/29/19/3928}
}
@article{Cavalieri2005,
  author = {Cavalieri, D. and De Filippo, C.},
  title = {Bioinformatic methods for integrating whole-genome expression results
	into cellular networks},
  journal = {Drug {D}iscov {T}oday},
  year = {2005},
  volume = {10},
  pages = {727--34},
  number = {10},
  abstract = {Extracting a comprehensive overview from the huge amount of information
	arising from whole-genome analyses is a significant challenge. {T}his
	review critically surveys the state of the art methods that are used
	to connect information from functional genomic studies to biological
	function. {C}luster analysis methods for inferring the correlation
	between genes are discussed, as are the methods for integrating gene
	expression information with existing information on biological pathways
	and the methods that combine cluster analysis with biological information
	to reconstruct novel biological networks.},
  keywords = {Cluster Analysis; *Computational Biology/methods/organization \& administration/trends;
	*Genomics/methods/organization \& administration/trends; Humans; Oligonucleotide
	Array Sequence Analysis/methods}
}
@article{Chan2003Detection,
  author = {Ian Chan and William Wells and Robert V Mulkern and Steven Haker
	and Jianqing Zhang and Kelly H Zou and Stephan E Maier and Clare
	M C Tempany},
  title = {Detection of prostate cancer by integration of line-scan diffusion,
	{T}2-mapping and {T}2-weighted magnetic resonance imaging; a multichannel
	statistical classifier.},
  journal = {Med {P}hys},
  year = {2003},
  volume = {30},
  pages = {2390--8},
  number = {9},
  month = sep,
  abstract = {A multichannel statistical classifier for detecting prostate cancer
	was developed and validated by combining information from three different
	magnetic resonance ({MR}) methodologies: {T}2-weighted, {T}2-mapping,
	and line scan diffusion imaging ({LSDI}). {F}rom these {MR} sequences,
	four different sets of image intensities were obtained: {T}2-weighted
	({T}2{W}) from {T}2-weighted imaging, {A}pparent {D}iffusion {C}oefficient
	({ADC}) from {LSDI}, and proton density ({PD}) and {T}2 ({T}2 {M}ap)
	from {T}2-mapping imaging. {M}anually segmented tumor labels from
	a radiologist, which were validated by biopsy results, served as
	tumor ``ground truth.'' {T}extural features were extracted from the
	images using co-occurrence matrix ({CM}) and discrete cosine transform
	({DCT}). {A}natomical location of voxels was described by a cylindrical
	coordinate system. {A} statistical jack-knife approach was used to
	evaluate our classifiers. {S}ingle-channel maximum likelihood ({ML})
	classifiers were based on 1 of the 4 basic image intensities. {O}ur
	multichannel classifiers: support vector machine ({SVM}) and {F}isher
	linear discriminant ({FLD}), utilized five different sets of derived
	features. {E}ach classifier generated a summary statistical map that
	indicated tumor likelihood in the peripheral zone ({PZ}) of the prostate
	gland. {T}o assess classifier accuracy, the average areas under the
	receiver operator characteristic ({ROC}) curves over all subjects
	were compared. {O}ur best {FLD} classifier achieved an average {ROC}
	area of 0.839(+/-0.064), and our best {SVM} classifier achieved an
	average {ROC} area of 0.761(+/-0.043). {T}he {T}2{W} {ML} classifier,
	our best single-channel classifier, only achieved an average {ROC}
	area of 0.599(+/-0.146). {C}ompared to the best single-channel {ML}
	classifier, our best multichannel {FLD} and {SVM} classifiers have
	statistically superior {ROC} performance ({P}=0.0003 and 0.0017,
	respectively) from pairwise two-sided t-test. {B}y integrating the
	information from multiple images and capturing the textural and anatomical
	features in tumor areas, summary statistical maps can potentially
	aid in image-guided prostate biopsy and assist in guiding and controlling
	delivery of localized therapy under image guidance.},
  pdf = {../local/Chan2003Detection.pdf},
  file = {Chan2003Detection.pdf:local/Chan2003Detection.pdf:PDF},
  keywords = {Algorithms, Anion Exchange Resins, Antigen-Antibody Complex, Artificial
	Intelligence, Automated, Automatic Data Processing, Biological, Blood
	Cells, Chemical, Chromatography, Cluster Analysis, Comparative Study,
	Computational Biology, Computer Simulation, Computer-Assisted, Data
	Interpretation, Databases, Decision Making, Decision Trees, Diffusion
	Magnetic Resonance Imaging, English Abstract, Epitopes, Expert Systems,
	Factual, Fuzzy Logic, Gene Expression Profiling, Gene Expression
	Regulation, Gene Targeting, Genome, Histocompatibility Antigens Class
	I, Humans, Image Interpretation, Image Processing, In Vitro, Indicators
	and Reagents, Information Storage and Retrieval, Ion Exchange, Least-Squares
	Analysis, Liver Cirrhosis, Magnetic Resonance Imaging, Male, Models,
	Neural Networks (Computer), Non-P.H.S., Non-U.S. Gov't, Nonlinear Dynamics, Nucleic
	Acid Conformation, P.H.S., Pattern Recognition, Prostatic Neoplasms,
	Protein, Protein Binding, Protein Interaction Mapping, Proteins, Proteome,
	Quantitative Structure-Activity Relationship, RNA, ROC Curve, Reproducibility
	of Results, Research Support, Sensitivity and Specificity, Sequence
	Analysis, Severity of Illness Index, Statistical, Structure-Activity
	Relationship, Subtraction Technique, T-Lymphocyte, Transcription
	Factors, Transfer, Treatment Outcome, U.S. Gov't, User-Computer Interface,
	14528961}
}
@article{Chan2002Comparison,
  author = {Kwokleung Chan and Te-Won Lee and Pamela A Sample and Michael H Goldbaum
	and Robert N Weinreb and Terrence J Sejnowski},
  title = {Comparison of machine learning and traditional classifiers in glaucoma
	diagnosis.},
  journal = {I{EEE} {T}rans {B}iomed {E}ng},
  year = {2002},
  volume = {49},
  pages = {963--74},
  number = {9},
  month = sep,
  abstract = {Glaucoma is a progressive optic neuropathy with characteristic structural
	changes in the optic nerve head reflected in the visual field. {T}he
	visual-field sensitivity test is commonly used in a clinical setting
	to evaluate glaucoma. {S}tandard automated perimetry ({SAP}) is a
	common computerized visual-field test whose output is amenable to
	machine learning. {W}e compared the performance of a number of machine
	learning algorithms with {STATPAC} indexes mean deviation, pattern
	standard deviation, and corrected pattern standard deviation. {T}he
	machine learning algorithms studied included multilayer perceptron
	({MLP}), support vector machine ({SVM}), and linear ({LDA}) and quadratic
	discriminant analysis ({QDA}), {P}arzen window, mixture of {G}aussian
	({MOG}), and mixture of generalized {G}aussian ({MGG}). {MLP} and
	{SVM} are classifiers that work directly on the decision boundary
	and fall under the discriminative paradigm. {G}enerative classifiers,
	which first model the data probability density and then perform classification
	via {B}ayes' rule, usually give deeper insight into the structure
	of the data space. {W}e have applied {MOG}, {MGG}, {LDA}, {QDA},
	and {P}arzen window to the classification of glaucoma from {SAP}.
	{P}erformance of the various classifiers was compared by the areas
	under their receiver operating characteristic curves and by sensitivities
	(true-positive rates) at chosen specificities (true-negative rates).
	{T}he machine-learning-type classifiers showed improved performance
	over the best indexes from {STATPAC}. {F}orward-selection and backward-elimination
	methodology further improved the classification rate and also has
	the potential to reduce testing time by diminishing the number of
	visual-field location measurements.},
  doi = {10.1109/TBME.2002.802012},
  pdf = {../local/Chan2002Comparison.pdf},
  file = {Chan2002Comparison.pdf:local/Chan2002Comparison.pdf:PDF},
  keywords = {Acute, Algorithms, Animals, Anion Exchange Resins, Artificial Intelligence,
	Automated, Base Pair Mismatch, Base Pairing, Base Sequence, Biological,
	Biosensing Techniques, Carcinoma, Chemical, Chromatography, Citric
	Acid Cycle, Classification, Cluster Analysis, Comparative Study,
	Computational Biology, Computer-Assisted, Cystadenoma, DNA, Databases,
	Decision Making, Diagnosis, Differential, Discriminant Analysis,
	Drug, Drug Design, Electrostatics, Epitopes, Eukaryotic Cells, Factual,
	False Negative Reactions, False Positive Reactions, Feasibility Studies,
	Female, Gene Expression, Gene Expression Profiling, Gene Expression
	Regulation, Genes, Genetic, Genetic Heterogeneity, Genetic Markers,
	Glaucoma, HLA Antigens, Hemolysins, Histocompatibility Antigens Class
	I, Humans, Internet, Intraocular Pressure, Ion Exchange, Lasers,
	Leukemia, Ligands, Likelihood Functions, Logistic Models, Lung Neoplasms,
	Lymphocytic, Lymphoma, Markov Chains, Mathematics, Messenger, Models,
	Molecular, Molecular Probe Techniques, Molecular Sequence Data, Nanotechnology,
	Neoplasm, Neoplasms, Neoplastic, Neural Networks (Computer), Neurological,
	Non-P.H.S., Non-Small-Cell Lung, Non-U.S. Gov't, Nucleic Acid Conformation,
	Nucleic Acid Hybridization, Observer Variation, Oligonucleotide Array
	Sequence Analysis, Open-Angle, Ophthalmoscopy, Optic Disk, Optic
	Nerve Diseases, Ovarian Neoplasms, P.H.S., Pattern Recognition, Peptides,
	Perimetry, Predictive Value of Tests, Probability, Probability Learning,
	Protein, Protein Binding, Protein Conformation, Proteins, Quality
	Control, Quantum Theory, RNA, RNA Splicing, ROC Curve, Receptors,
	Reference Values, Regression Analysis, Reproducibility of Results,
	Research Support, Robotics, Saccharomyces cerevisiae Proteins, Sensitivity
	and Specificity, Sequence Analysis, Signal Processing, Software,
	Statistical, Stomach Neoplasms, Structural, Structure-Activity Relationship,
	T-Lymphocyte, Thermodynamics, Transcription, Tumor Markers, U.S.
	Gov't, 12214886},
  url = {http://dx.doi.org/10.1109/TBME.2002.802012}
}
@article{Chen2004Prediction,
  author = {Chen, Y. C. and Lin, Y. S. and Lin, C. J. and Hwang, J. K.},
  title = {Prediction of the bonding states of cysteines using the support vector
	machines based on multiple feature vectors and cysteine state sequences},
  journal = {Proteins},
  year = {2004},
  volume = {55},
  pages = {1036--1042},
  number = {4},
  abstract = {The support vector machine ({SVM}) method is used to predict the bonding
	states of cysteines. {B}esides using local descriptors such as the
	local sequences, we include global information, such as amino acid
	compositions and the patterns of the states of cysteines (bonded
	or nonbonded), or cysteine state sequences, of the proteins. {W}e
	found that {SVM} based on local sequences or global amino acid compositions
	yielded similar prediction accuracies for the data set comprising
	4136 cysteine-containing segments extracted from 969 nonhomologous
	proteins. {H}owever, the {SVM} method based on multiple feature vectors
	(combining local sequences and global amino acid compositions) significantly
	improves the prediction accuracy, from 80\% to 86\%. {I}f coupled with
	cysteine state sequences, {SVM} based on multiple feature vectors
	yields 90\% in overall prediction accuracy and a 0.77 {M}atthews correlation
	coefficient, around 10\% and 22\% higher than the corresponding values
	obtained by {SVM} based on local sequence information.},
  doi = {10.1002/prot.20079},
  pdf = {../local/Chen2004Prediction.pdf},
  file = {Chen2004Prediction.pdf:local/Chen2004Prediction.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1002/prot.20079}
}
@article{Chen2005Understanding,
  author = {Chen, Y. and Xu, D.},
  title = {Understanding protein dispensability through machine-learning analysis
	of high-throughput data},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21},
  pages = {575--581},
  month = mar,
  abstract = {Motivation: {P}rotein dispensability is fundamental to understanding
	of gene function and evolution. {R}ecent advances in generating high-throughput
	data such as genomic sequence data, protein-protein interaction data,
	gene-expression data, and growth-rate data of mutants allow us to
	investigate protein dispensability systematically at the genome scale. {R}esults:
	{I}n our studies, protein dispensability is represented as a fitness
	score that is measured by the growth rate of gene-deletion mutants.
	{T}hrough analyses of high-throughput data in yeast {S}accharomyces
	cerevisiae, we found that a protein's dispensability had significant
	correlations with its evolutionary rate and duplication rate, as
	well as its connectivity in protein-protein interaction network and
	gene-expression correlation network. {N}eural network and support
	vector machine were applied to predict protein dispensability through
	high-throughput data. {O}ur studies shed some lights on global characteristics
	of protein dispensability and evolution. {A}vailability: {T}he original
	datasets for protein dispensability analysis and prediction, together
	with related scripts, are available at http://digbio.missouri.edu/~ychen/{P}ro{D}ispen/.},
  doi = {10.1093/bioinformatics/bti058},
  pdf = {../local/Chen2005Understanding.pdf},
  file = {Chen2005Understanding.pdf:local/Chen2005Understanding.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1093/bioinformatics/bti058}
}
@article{Chou2001Prediction,
  author = {Chou, K.-C.},
  title = {Prediction of protein signal sequences and their cleavage sites},
  journal = {Proteins {S}truct. {F}unct. {G}enet.},
  year = {2001},
  volume = {42},
  pages = {136--139},
  pdf = {../local/chou01.pdf},
  file = {chou01.pdf:local/chou01.pdf:PDF},
  subject = {bioprot},
  url = {http://www3.interscience.wiley.com/cgi-bin/abstract/75504759/START}
}
@article{Chou2001Using,
  author = {Chou, K.-C.},
  title = {Using subsite coupling to predict signal peptides},
  journal = {Protein {E}ng.},
  year = {2001},
  volume = {14},
  pages = {75--79},
  number = {2},
  pdf = {../local/chou01b.pdf},
  file = {chou01b.pdf:local/chou01b.pdf:PDF},
  subject = {bioprot},
  url = {http://protein.oupjournals.org/cgi/content/abstract/14/2/75}
}
@article{Chou2002Using,
  author = {Chou, K.-C. and Cai, Y.-D.},
  title = {Using {F}unctional {D}omain {C}omposition and {S}upport {V}ector
	{M}achines for {P}rediction of {P}rotein {S}ubcellular {L}ocation},
  journal = {J. {B}iol. {C}hem.},
  year = {2002},
  volume = {277},
  pages = {45765--45769},
  number = {48},
  abstract = {Proteins are generally classified into the following 12 subcellular
	locations: 1) chloroplast, 2) cytoplasm, 3) cytoskeleton, 4) endoplasmic
	reticulum, 5) extracellular, 6) {G}olgi apparatus, 7) lysosome, 8)
	mitochondria, 9) nucleus, 10) peroxisome, 11) plasma membrane, and
	12) vacuole. {B}ecause the function of a protein is closely correlated
	with its subcellular location, with the rapid increase in new protein
	sequences entering into databanks, it is vitally important for both
	basic research and pharmaceutical industry to establish a high throughput
	tool for predicting protein subcellular location. {I}n this paper,
	a new concept, the so-called ``functional domain composition'' is introduced.
	{B}ased on the novel concept, the representation for a protein can
	be defined as a vector in a high-dimensional space, where each of
	the clustered functional domains derived from the protein universe
	serves as a vector base. {W}ith such a novel representation for a
	protein, the support vector machine ({SVM}) algorithm is introduced
	for predicting protein subcellular location. {H}igh success rates
	are obtained by the self-consistency test, jackknife test, and independent
	dataset test, respectively. {T}he current approach not only can play
	an important complementary role to the powerful covariant discriminant
	algorithm based on the pseudo amino acid composition representation
	({C}hou, {K}. {C}. (2001) {P}roteins {S}truct. {F}unct. {G}enet.
	43, 246-255; {C}orrection (2001) {P}roteins {S}truct. {F}unct. {G}enet.
	44, 60), but also may greatly stimulate the development of this area.},
  pdf = {../local/Chou2002Using.pdf},
  file = {Chou2002Using.pdf:local/Chou2002Using.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://www.jbc.org/cgi/content/abstract/277/48/45765}
}
@article{Chow2001Identifying,
  author = {Chow, M. L. and Moler, E. J. and Mian, I. S.},
  title = {Identifying marker genes in transcription profiling data using a
	mixture of feature relevance experts.},
  journal = {Physiol. {G}enomics},
  year = {2001},
  volume = {5},
  pages = {99--111},
  number = {2},
  month = {Mar},
  abstract = {Transcription profiling experiments permit the expression levels of
	many genes to be measured simultaneously. {G}iven profiling data
	from two types of samples, genes that most distinguish the samples
	(marker genes) are good candidates for subsequent in-depth experimental
	studies and developing decision support systems for diagnosis, prognosis,
	and monitoring. {T}his work proposes a mixture of feature relevance
	experts as a method for identifying marker genes and illustrates
	the idea using published data from samples labeled as acute lymphoblastic
	and myeloid leukemia ({ALL}, {AML}). {A} feature relevance expert
	implements an algorithm that calculates how well a gene distinguishes
	samples, reorders genes according to this relevance measure, and
	uses a supervised learning method [here, support vector machines
	({SVM}s)] to determine the generalization performances of different
	nested gene subsets. {T}he mixture of three feature relevance experts
	examined implement two existing and one novel feature relevance measures.
	{F}or each expert, a gene subset consisting of the top 50 genes distinguished
	{ALL} from {AML} samples as completely as all 7,070 genes. {T}he
	125 genes at the union of the top 50s are plausible markers for a
	prototype decision support system. {C}hromosomal aberration and other
	data support the prediction that the three genes at the intersection
	of the top 50s, cystatin {C}, azurocidin, and adipsin, are good targets
	for investigating the basic biology of {ALL}/{AML}. {T}he same data
	were employed to identify markers that distinguish samples based
	on their labels of {T} cell/{B} cell, peripheral blood/bone marrow,
	and male/female. {S}elenoprotein {W} may discriminate {T} cells from
	{B} cells. {R}esults from analysis of transcription profiling data
	from tumor/nontumor colon adenocarcinoma samples support the general
	utility of the aforementioned approach. {T}heoretical issues such
	as choosing {SVM} kernels and their parameters, training and evaluating
	feature relevance experts, and the impact of potentially mislabeled
	samples on marker identification (feature selection) are discussed.},
  pdf = {../local/Chow2001Identifying.pdf},
  file = {Chow2001Identifying.pdf:local/Chow2001Identifying.pdf:PDF},
  keywords = {biosvm},
  pii = {5/2/99},
  url = {http://physiolgenomics.physiology.org/cgi/content/abstract/5/2/99}
}
@article{Churchill2002Fundamentals,
  author = {Churchill, G. A.},
  title = {Fundamentals of experimental design for {cDNA} microarrays},
  journal = {Nat. Genet.},
  year = {2002},
  volume = {32},
  pages = {490--495},
  number = {Suppl},
  month = {Dec},
  abstract = {Microarray technology is now widely available and is being applied
	to address increasingly complex scientific questions. Consequently,
	there is a greater demand for statistical assessment of the conclusions
	drawn from microarray experiments. This review discusses fundamental
	issues of how to design an experiment to ensure that the resulting
	data are amenable to statistical analysis. The discussion focuses
	on two-color spotted cDNA microarrays, but many of the same issues
	apply to single-color gene-expression assays as well.},
  doi = {10.1038/ng1031},
  institution = {The Jackson Laboratory, 600 Main Street, Bar Harbor, ME 04609, USA.
	garyc@jax.org},
  keywords = {Animals; DNA, Complementary, analysis; Gene Expression; Gene Expression
	Profiling, methods; Mice; Models, Biological; Oligonucleotide Array
	Sequence Analysis, methods; Reference Standards; Reproducibility
	of Results; Research Design; Statistics as Topic},
  language = {eng},
  medline-pst = {ppublish},
  owner = {phupe},
  pii = {ng1031},
  pmid = {12454643},
  timestamp = {2011.04.08},
  url = {http://dx.doi.org/10.1038/ng1031}
}
@article{Cole2005Comparing,
  author      = {Cole, Jason C and Murray, Christopher W and Nissink, J. Willem M and
    Taylor, Richard D and Taylor, Robin},
  title       = {Comparing protein-ligand docking programs is difficult.},
  journal     = {Proteins},
  year        = {2005},
  month       = {Aug},
  volume      = {60},
  number      = {3},
  pages       = {325--332},
  doi         = {10.1002/prot.20497},
  url         = {http://dx.doi.org/10.1002/prot.20497},
  pmid        = {15937897},
  institution = {Cambridge Crystallographic Data Centre, Cambridge, United Kingdom.},
  keywords    = {Algorithms; Artificial Intelligence; Binding Sites; Computational
    Biology, methods; Computer Simulation; Crystallization; Crystallography,
    X-Ray; Databases, Protein; Ligands; Models, Molecular; Molecular
    Structure; Programming Languages; Protein Binding; Proteins, chemistry;
    Proteomics, methods; Reproducibility of Results; Software},
  abstract    = {There is currently great interest in comparing protein-ligand docking
    programs. A review of recent comparisons shows that it is difficult
    to draw conclusions of general applicability. Statistical hypothesis
    testing is required to ensure that differences in pose-prediction
    success rates and enrichment rates are significant. Numerical measures
    such as root-mean-square deviation need careful interpretation and
    may profitably be supplemented by interaction-based measures and
    visual inspection of dockings. Test sets must be of appropriate diversity
    and of good experimental reliability. The effects of crystal-packing
    interactions may be important. The method used for generating starting
    ligand geometries and positions may have an appreciable effect on
    docking results. For fair comparison, programs must be given search
    problems of equal complexity (e.g. binding-site regions of the same
    size) and approximately equal time in which to solve them. Comparisons
    based on rescoring require local optimization of the ligand in the
    space of the new objective function. Re-implementations of published
    scoring functions may give significantly different results from the
    originals. Ostensibly minor details in methodology may have a profound
    influence on headline success rates.},
  owner       = {bricehoffmann},
  timestamp   = {2009.02.13}
}
@article{Collier2004Comparison,
  author = {Collier, Nigel and Takeuchi, Koichi},
  title = {Comparison of character-level and part of speech features for name
	recognition in biomedical texts.},
  journal = {J {B}iomed {I}nform},
  year = {2004},
  volume = {37},
  pages = {423--435},
  number = {6},
  month = {Dec},
  abstract = {The immense volume of data which is now available from experiments
	in molecular biology has led to an explosion in reported results
	most of which are available only in unstructured text format. {F}or
	this reason there has been great interest in the task of text mining
	to aid in fact extraction, document screening, citation analysis,
	and linkage with large gene and gene-product databases. {I}n particular
	there has been an intensive investigation into the named entity ({NE})
	task as a core technology in all of these tasks which has been driven
	by the availability of high volume training sets such as the {GENIA}
	v3.02 corpus. {D}espite such large training sets accuracy for biology
	{NE} has proven to be consistently far below the high levels of performance
	in the news domain where {F} scores above 90 are commonly reported
	which can be considered near to human performance. {W}e argue that
	it is crucial that more rigorous analysis of the factors that contribute
	to the model's performance be applied to discover where the underlying
	limitations are and what our future research direction should be.
	{O}ur investigation in this paper reports on variations of two widely
	used feature types, part of speech ({POS}) tags and character-level
	orthographic features, and makes a comparison of how these variations
	influence performance. {W}e base our experiments on a proven state-of-the-art
	model, support vector machines using a high quality subset of 100
	annotated {MEDLINE} abstracts. {E}xperiments reveal that the best
	performing features are orthographic features with {F} score of 72.6.
	{A}lthough the {B}rill tagger trained in-domain on the {GENIA} v3.02p
	{POS} corpus gives the best overall performance of any {POS} tagger,
	at an {F} score of 68.6, this is still significantly below the orthographic
	features. {I}n combination these two features types appear to interfere
	with each other and degrade performance slightly to an {F} score
	of 72.3.},
  doi = {10.1016/j.jbi.2004.08.008},
  pdf = {../local/Collier2004Comparison.pdf},
  file = {Collier2004Comparison.pdf:local/Collier2004Comparison.pdf:PDF},
  keywords = {biosvm nlp},
  pii = {S1532-0464(04)00088-7},
  url = {http://dx.doi.org/10.1016/j.jbi.2004.08.008}
}
@article{Coupez2006Docking,
  author = {Coupez, B. and Lewis, R. A.},
  title = {Docking and scoring---theoretically easy, practically impossible?},
  journal = {Curr. Med. Chem.},
  year = {2006},
  volume = {13},
  pages = {2995--3003},
  number = {25},
  abstract = {Structure-based Drug Design (SBDD) is an essential part of the modern
	medicinal chemistry, and has led to the acceleration of many projects,
	and even to drugs on the market. Programs that perform docking and
	scoring of ligands to receptors are powerful tools in the drug designer's
	armoury that enhance the process of SBDD. They are even deployed
	on the desktop of many bench chemists. It is timely to review the
	state of the art, to understand how good our docking programs are,
	and what are the issues. In this review we would like to provide
	a guide around the reliable aspects of docking and scoring and the
	associated pitfalls aiming at an audience of medicinal chemists rather
	than modellers. For convenience, we will divide the review into two
	parts: docking and scoring. Docking concerns the preparation of the
	receptor and the ligand(s), the sampling of conformational space
	and stereochemistry (if appropriate). Scoring concerns the evaluation
	of all of the ligand-receptor poses generated by docking. The two
	processes are not truly independent, and this will be discussed here
	in detail. The preparation of the receptor and ligand(s) before docking
	requires great care. For the receptor, issues of protonation, tautomerisation
	and hydration are key, and we will discuss current approaches to
	these issues. Even more important is the degree of sampling: can
	the algorithms reproduce what is observed experimentally? If they
	can, are the scoring algorithms good enough to recognise this pose
	as the best? Do the scores correlate with observed binding affinity?
	How does local knowledge of the target (for example hinge-binding
	to a kinase) affect the accuracy of the predictions? We will review
	the key findings from several evaluation studies and present conclusions
	about when and how to interpret and trust the results of docking
	and scoring. Finally, we will present an outline of some of the latest
	developments in the area of scoring functions.},
  institution = {Computer-Aided Drug Discovery, Novartis Institute for Biomedical
	Research, Switzerland.},
  keywords = {Cluster Analysis; Computational Biology, methods; Computer Simulation;
	Computer-Aided Design; Databases, Factual; Drug Design; Ligands;
	Models, Chemical; Software; Structure-Activity Relationship},
  owner = {bricehoffmann},
  pmid = {17073642},
  timestamp = {2009.02.13}
}
@article{Cuff1999Evaluation,
  author = {Cuff, J. A. and Barton, G. J.},
  title = {Evaluation and improvement of multiple sequence methods for protein
	secondary structure prediction},
  journal = {Proteins {S}truct. {F}unct. {G}enet.},
  year = {1999},
  volume = {34},
  pages = {508--519},
  pdf = {../local/cuff99.pdf},
  file = {cuff99.pdf:local/cuff99.pdf:PDF},
  subject = {biocasp},
  url = {http://www3.interscience.wiley.com/cgi-bin/fulltext/65000270/FILE?TPL=ft_start}
}
@article{Cui2004Esub8,
  author = {Cui, Q. and Jiang, T. and Liu, B. and Ma, S.},
  title = {Esub8: {A} novel tool to predict protein subcellular localizations
	in eukaryotic organisms},
  journal = {{BMC} Bioinformatics},
  year = {2004},
  volume = {5},
  pages = {66},
  abstract = {Background {S}ubcellular localization of a new protein sequence is
	very important and fruitful for understanding its function. {A}s
	the number of new genomes has dramatically increased over recent
	years, a reliable and efficient system to predict protein subcellular
	location is urgently needed. {R}esults {E}sub8 was developed to predict
	protein subcellular localizations for eukaryotic proteins based on
	amino acid composition. {I}n this research, the proteins are classified
	into the following eight groups: chloroplast, cytoplasm, extracellular,
	{G}olgi apparatus, lysosome, mitochondria, nucleus and peroxisome.
	{W}e know subcellular localization is a typical classification problem;
	consequently, a one-against-one (1-v-1) multi-class support vector
	machine was introduced to construct the classifier. {U}nlike previous
	methods, ours considers the order information of protein sequences
	by a different method. {O}ur method is tested in three subcellular
	localization predictions for prokaryotic proteins and four subcellular
	localization predictions for eukaryotic proteins on {R}einhardt's
	dataset. {T}he results are then compared to several other methods.
	{T}he total prediction accuracies of two tests are both 100\% by a
	self-consistency test, and are 92.9\% and 84.14\% by the jackknife
	test, respectively. {E}sub8 also provides excellent results: the
	total prediction accuracies are 100\% by a self-consistency test and
	87\% by the jackknife test. {C}onclusions {O}ur method represents
	a different approach for predicting protein subcellular localization
	and achieved a satisfactory result; furthermore, we believe {E}sub8
	will be a useful tool for predicting protein subcellular localizations
	in eukaryotic organisms.},
  doi = {10.1186/1471-2105-5-66},
  pdf = {../local/Cui2004Esub8.pdf},
  file = {Cui2004Esub8.pdf:local/Cui2004Esub8.pdf:PDF},
  keywords = {biosvm},
  owner = {vert},
  url = {http://www.biomedcentral.com/1471-2105/5/66}
}
@article{Cuturi2005context-tree,
  author = {Cuturi, M. and Vert, J.-P.},
  title = {The context-tree kernel for strings},
  journal = {Neural {N}etwork.},
  year = {2005},
  volume = {18},
  pages = {1111--1123},
  number = {4},
  abstract = {We propose a new kernel for strings which borrows ideas and techniques
	from information theory and data compression. {T}his kernel can be
	used in combination with any kernel method, in particular {S}upport
	{V}ector {M}achines for string classification, with notable applications
	in proteomics. {B}y using a {B}ayesian averaging framework with conjugate
	priors on a class of {M}arkovian models known as probabilistic
	suffix trees or context-trees, we compute the value of this kernel
	in linear time and space while only using the information contained
	in the spectrum of the considered strings. {T}his is ensured through
	an adaptation of a compression method known as the context-tree weighting
	algorithm. {E}ncouraging classification results are reported on a
	standard protein homology detection experiment, showing that the
	context-tree kernel performs well with respect to other state-of-the-art
	methods while using no biological prior knowledge.},
  doi = {10.1016/j.neunet.2005.07.010},
  pdf = {../local/Cuturi2005context-tree.pdf},
  file = {Cuturi2005context-tree.pdf:local/Cuturi2005context-tree.pdf:PDF},
  keywords = {biosvm},
  url = {http://dx.doi.org/10.1016/j.neunet.2005.07.010}
}
@inproceedings{Cuturi2004mutual,
  author = {Cuturi, M. and Vert, J.-P.},
  title = {A mutual information kernel for strings},
  booktitle = {Proceedings of {IJCNN} 2004},
  year = {2004},
  pages = {1904--1910},
  pdf = {http://cg.ensmp.fr/~vert/publi/04ijcnn/ijcnn04.pdf},
  file = {ijcnn04.pdf:http\://cg.ensmp.fr/~vert/publi/04ijcnn/ijcnn04.pdf:PDF},
  keywords = {biosvm},
  owner = {vert}
}
@article{Doennes2002Prediction,
  author = {D{\"o}nnes, Pierre and Elofsson, Arne},
  title = {Prediction of {MHC} class {I} binding peptides, using {SVMHC}.},
  journal = {BMC Bioinformatics},
  year = {2002},
  volume = {3},
  pages = {25},
  month = {Sep},
  abstract = {BACKGROUND: T-cells are key players in regulating a specific immune
	response. Activation of cytotoxic T-cells requires recognition of
	specific peptides bound to Major Histocompatibility Complex (MHC)
	class I molecules. MHC-peptide complexes are potential tools for
	diagnosis and treatment of pathogens and cancer, as well as for the
	development of peptide vaccines. Only one in 100 to 200 potential
	binders actually binds to a certain MHC molecule, therefore a good
	prediction method for MHC class I binding peptides can reduce the
	number of candidate binders that need to be synthesized and tested.
	RESULTS: Here, we present a novel approach, SVMHC, based on support
	vector machines to predict the binding of peptides to MHC class I
	molecules. This method seems to perform slightly better than two
	profile based methods, SYFPEITHI and HLA\_BIND. The implementation
	of SVMHC is quite simple and does not involve any manual steps, therefore
	as more data become available it is trivial to provide prediction
	for more MHC types. SVMHC currently contains prediction for 26 MHC
	class I types from the MHCPEP database or alternatively 6 MHC class
	I types from the higher quality SYFPEITHI database. The prediction
	models for these MHC types are implemented in a public web service
	available at http://www.sbc.su.se/svmhc/. CONCLUSIONS: Prediction
	of MHC class I binding peptides using Support Vector Machines, shows
	high performance and is easy to apply to a large number of MHC class
	I types. As more peptide data are put into MHC databases, SVMHC can
	easily be updated to give prediction for additional MHC class I types.
	We suggest that the number of binding peptides needed for SVM training
	is at least 20 sequences.},
  keywords = {Animals; Artificial Intelligence; Comparative Study; Computational
	Biology; Databases, Protein; Epitopes, T-Lymphocyte; HLA Antigens;
	Histocompatibility Antigens Class I; Humans; Peptides; Predictive
	Value of Tests; Protein Binding; Research Support, Non-U.S. Gov't;
	Sensitivity and Specificity},
  owner = {jacob},
  pmid = {12225620},
  timestamp = {2006.08.30}
}
@article{Davies2007Harnessing,
  author    = {Davies, Matthew N and Flower, Darren R},
  title     = {Harnessing bioinformatics to discover new vaccines.},
  journal   = {Drug Discov Today},
  year      = {2007},
  month     = {May},
  volume    = {12},
  number    = {9-10},
  pages     = {389--395},
  doi       = {10.1016/j.drudis.2007.03.010},
  url       = {http://dx.doi.org/10.1016/j.drudis.2007.03.010},
  pii       = {S1359-6446(07)00135-3},
  pmid      = {17467575},
  keywords  = {Animals; Computational Biology; Drug Design; Epitope Mapping; Humans;
    Software Design; Vaccination; Vaccines},
  abstract  = {Vaccine design is highly suited to the application of in silico techniques,
    for both the discovery and development of new and existing vaccines.
    Here, we discuss computational contributions to epitope mapping and
    reverse vaccinology, two techniques central to the new discipline
    of immunomics. Also discussed are methods to improve the efficiency
    of vaccination, such as codon optimization and adjuvant discovery
    in addition to the identification of allergenic proteins. We also
    review current software developed to facilitate vaccine design.},
  owner     = {laurent},
  timestamp = {2007.08.23}
}
@article{Debouck1999DNA,
  author = {Debouck, C. and Goodfellow, P. N.},
  title = {{DNA} microarrays in drug discovery and development.},
  journal = {Nat. Genet.},
  year = {1999},
  volume = {21},
  pages = {48--50},
  number = {1 Suppl},
  month = {Jan},
  abstract = {DNA microarrays can be used to measure the expression patterns of
	thousands of genes in parallel, generating clues to gene function
	that can help to identify appropriate targets for therapeutic intervention.
	They can also be used to monitor changes in gene expression in response
	to drug treatments. Here, we discuss the different ways in which
	microarray analysis is likely to affect drug discovery.},
  doi = {10.1038/4475},
  keywords = {Agricultural, Alleles, Alternaria, Amino Acid, Amino Acid Chloromethyl
	Ketones, Amino Acid Sequence, Animal, Animals, Apoptosis, Asthma,
	Bacteria, Base Sequence, Binding Sites, Biotechnology, Blotting,
	Bone Density, Bone Matrix, Bone and Bones, CCR5, Camptothecin, Caspases,
	Cathepsins, Cell Surface, Central America, Chloroplast, Chondrocytes,
	Chromosome Mapping, Chromosomes, Cloning, Cluster Analysis, Collagen,
	Comparative Study, Coumarins, Crops, Crystallography, DNA, DNA Primers,
	Dipeptides, Disease, Disease Models, Drug Design, Drug Evaluation,
	Drug Industry, Enzyme Activation, Enzyme Inhibitors, Escherichia
	coli, Evolution, Exons, Expressed Sequence Tags, Female, Fetus, Fluorescent
	Dyes, Food Microbiology, Founder Effect, GTP-Binding Proteins, Gene
	Expression, Gene Frequency, Gene Library, Genes, Genetic, Genetic
	Predisposition to Disease, Genome, Geography, Growth Plate, Haplotypes,
	Hordeum, Human, Humans, Inclusion Bodies, Injections, Intraperitoneal,
	Introns, Isatin, Knockout, Male, Membrane Proteins, Messenger, Mice,
	Models, Molecular, Molecular Sequence Data, Molecular Structure,
	Mutation, Mycotoxins, Neutrophils, Non-U.S. Gov't, Northern, Oligonucleotide
	Array Sequence Analysis, Osteoarthritis, Osteochondrodysplasias,
	Osteoclasts, Osteopetrosis, Pair 15, Phaseolus, Polymorphism, Preclinical,
	Pregnancy, Promoter Regions (Genetics), Protein Precursors, Proteomics,
	RNA, Receptors, Recombinant Fusion Proteins, Recombinant Proteins,
	Research Support, Restriction Fragment Length, Ribosomal Proteins,
	Sequence Alignment, Sequence Analysis, Sequence Homology, South America,
	Species Specificity, Splenomegaly, Sulfonamides, Synteny, Tissue
	Distribution, Transcription, Trichothecenes, X-Ray},
  owner = {piedro},
  pmid = {9915501},
  timestamp = {2006.08.11},
  url = {http://dx.doi.org/10.1038/4475}
}
@article{Degroeve2002Feature,
  author = {Degroeve, S. and De Baets, B. and Van de Peer, Y. and Rouze, P.},
  title = {Feature subset selection for splice site prediction},
  journal = {Bioinformatics},
  year = {2002},
  volume = {18},
  pages = {S75--S83},
  number = {Suppl. 2},
  abstract = {Motivation: {T}he large amount of available annotated {A}rabidopsis
	thaliana sequences allows the induction of splice site prediction
	models with supervised learning algorithms (see {H}aussler (1998)
	for a review and references). {T}hese algorithms need information
	sources or features from which the models can be computed. {F}or
	splice site prediction, the features we consider in this study are
	the presence or absence of certain nucleotides in close proximity
	to the splice site. {S}ince it is not known how many and which nucleotides
	are relevant for splice site prediction, the set of features is chosen
	large enough such that the probability that all relevant information
	sources are in the set is very high. {U}sing only those features
	that are relevant for constructing a splice site prediction system
	might improve the system and might also provide us with useful biological
	knowledge. {U}sing fewer features will of course also improve the
	prediction speed of the system. {R}esults: {A} wrapper-based feature
	subset selection algorithm using a support vector machine or a naive
	{B}ayes prediction method was evaluated against the traditional method
	for selecting features relevant for splice site prediction. {O}ur
	results show that this wrapper approach selects features that improve
	the performance against the use of all features and against the use
	of the features selected by the traditional method. {A}vailability:
	{T}he data and additional interactive graphs on the selected feature
	subsets are available at http://www.psb.rug.ac.be/gps {C}ontact:
	svgro@gengenp.rug.ac.be yvdp@gengenp.rug.ac.be},
  pdf = {../local/Degroeve2002Feature.pdf},
  file = {Degroeve2002Feature.pdf:local/Degroeve2002Feature.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/18/suppl_2/S75}
}
@article{Degroeve2005SpliceMachine,
  author = {Degroeve, S. and Saeys, Y. and De Baets, B. and Rouze, P. and Van
	de Peer, Y.},
  title = {{{S}plice{M}achine}: predicting splice sites from high-dimensional
	local context representations},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21},
  pages = {1332--1338},
  abstract = {Motivation: {I}n this age of complete genome sequencing, finding the
	location and structure of genes is crucial for further molecular
	research. {T}he accurate prediction of intron boundaries largely
	facilitates the correct prediction of gene structure in nuclear genomes.
	{M}any tools for localizing these boundaries on {DNA} sequences have
	been developed and are available to researchers through the internet.
	{N}evertheless, these tools still make many false positive predictions. {R}esults:
	{T}his manuscript presents a novel publicly available splice site
	prediction tool named {S}plice{M}achine that (i) shows state-of-the-art
	prediction performance on {A}rabidopsis thaliana and human sequences,
	(ii) performs a computationally fast annotation, and (iii) can be
	trained by the user on its own data. {A}vailability: {R}esults, figures
	and software are available at http://bioinformatics.psb.ugent.be/supplementary\_data/.},
  doi = {10.1093/bioinformatics/bti166},
  pdf = {../local/Degroeve2005SpliceMachine.pdf},
  file = {Degroeve2005SpliceMachine.pdf:local/Degroeve2005SpliceMachine.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/bti166v1}
}
@inproceedings{Deshpande2002Evaluation,
  author    = {Deshpande, M. and Karypis, G.},
  title     = {Evaluation of {T}echniques for {C}lassifying {B}iological {S}equences},
  booktitle = {P{AKDD} '02: {P}roceedings of the 6th {P}acific-{A}sia {C}onference
    on {A}dvances in {K}nowledge {D}iscovery and {D}ata {M}ining},
  publisher = {Springer Verlag},
  year      = {2002},
  pages     = {417--431},
  keywords  = {biosvm},
  abstract  = {In recent years we have witnessed an exponential increase in the amount
    of biological information, either {DNA} or protein sequences, that
    has become available in public databases. {T}his has been followed
    by an increased interest in developing computational techniques to
    automatically classify these large volumes of sequence data into
    various categories corresponding to either their role in the chromosomes,
    their structure, and/or their function. {I}n this paper we evaluate
    some of the widely-used sequence classification algorithms and develop
    a framework for modeling sequences in a fashion so that traditional
    machine learning algorithms, such as support vector machines, can
    be applied easily. {O}ur detailed experimental evaluation shows that
    the {SVM}-based approaches are able to achieve higher classification
    accuracy compared to the more traditional sequence classification
    algorithms such as {M}arkov model based techniques and {K}-nearest
    neighbor based approaches.},
  pdf       = {../local/Deshpande2002Evaluation.pdf},
  file      = {Deshpande2002Evaluation.pdf:local/Deshpande2002Evaluation.pdf:PDF},
  owner     = {jeanphilippevert}
}
@article{Dhingra2005Substantial,
  author      = {Dhingra, Vikas and Gupta, Mukta and Andacht, Tracy and Fu, Zhen F},
  title       = {New frontiers in proteomics research: a perspective.},
  journal     = {Int. J. Pharm.},
  year        = {2005},
  month       = {Aug},
  volume      = {299},
  number      = {1-2},
  pages       = {1--18},
  doi         = {10.1016/j.ijpharm.2005.04.010},
  url         = {http://dx.doi.org/10.1016/j.ijpharm.2005.04.010},
  pii         = {S0378-5173(05)00226-7},
  pmid        = {15979831},
  institution = {Department of Pathology, University of Georgia, Athens, GA 30602,
    USA. vdhingra@vet.uga.edu},
  keywords    = {Computational Biology; Electrophoresis, Gel, Two-Dimensional; Humans;
    Peptide Mapping; Protein Interaction Mapping; Proteomics; Spectrometry,
    Mass, Matrix-Assisted Laser Desorption-Ionization},
  abstract    = {Substantial advances have been made in the fundamental understanding
    of human biology, ranging from DNA structure to identification of
    diseases associated with genetic abnormalities. Genome sequence information
    is becoming available in unprecedented amounts. The absence of a
    direct functional correlation between gene transcripts and their
    corresponding proteins, however, represents a significant roadblock
    for improving the efficiency of biological discoveries. The success
    of proteomics depends on the ability to identify and analyze protein
    products in a cell or tissue and, this is reliant on the application
    of several key technologies. Proteomics is in its exponential growth
    phase. Two-dimensional electrophoresis complemented with mass spectrometry
    provides a global view of the state of the proteins from the sample.
    Proteins identification is a requirement to understand their functional
    diversity. Subtle difference in protein structure and function can
    contribute to complexity and diversity of life. This review focuses
    on the progress and the applications of proteomics science with special
    reference to integration of the evolving technologies involved to
    address biological questions.},
  owner       = {ljacob},
  timestamp   = {2009.09.14}
}
@article{Diekman2003Hybrid,
  author = {Diekman, Casey and He, Wei and Prabhu, Nagabhushana and Cramer, Harvey},
  title = {Hybrid methods for automated diagnosis of breast tumors.},
  journal = {Anal {Q}uant {C}ytol {H}istol},
  year = {2003},
  volume = {25},
  pages = {183--190},
  number = {4},
  month = {Aug},
  abstract = {O{BJECTIVE}: {T}o design and analyze a new family of hybrid methods
	for the diagnosis of breast tumors using fine needle aspirates. {STUDY}
	{DESIGN}: {W}e present a radically new approach to the design of
	diagnosis systems. {I}n the new approach, a nonlinear classifier
	with high sensitivity but low specificity is hybridized with a linear
	classifier having low sensitivity but high specificity. {D}ata from
	the {W}isconsin {B}reast {C}ancer {D}atabase are used to evaluate,
	computationally, the performance of the hybrid classifiers. {RESULTS}:
	{T}he diagnosis scheme obtained by hybridizing the nonlinear classifier
	ellipsoidal multisurface method ({EMSM}) with the linear classifier
	proximal support vector machine ({PSVM}) was found to have a mean
	sensitivity of 97.36\% and a mean specificity of 95.14\% and was
	found to yield a 2.44\% improvement in the reliability of positive
	diagnosis over that of {EMSM} at the expense of 0.4\% degradation
	in the reliability of negative diagnosis, again compared to {EMSM}.
	{A}t the 95\% confidence level we can trust the hybrid method to
	be 96.19-98.53\% correct in its malignant diagnosis of new tumors
	and 93.57-96.71\% correct in its benign diagnosis. {CONCLUSION}:
	{H}ybrid diagnosis schemes represent a significant paradigm shift
	and provide a promising new technique to improve the specificity
	of nonlinear classifiers without seriously affecting the high sensitivity
	of nonlinear classifiers.},
  keywords = {Algorithms, Amino Acid Sequence, Amino Acids, Anion Exchange Resins,
	Antigen-Antibody Complex, Artificial Intelligence, Automated, Automatic
	Data Processing, Benchmarking, Biological, Biological Markers, Biopsy,
	Blood Cells, Blood Proteins, Breast Neoplasms, Cell Line, Cellular
	Structures, Chemical, Chromatography, Chromosome Aberrations, Cluster
	Analysis, Colonic Neoplasms, Comparative Study, Computational Biology,
	Computer Simulation, Computer-Assisted, Computing Methodologies,
	DNA, Data Interpretation, Databases, Decision Making, Decision Trees,
	Diagnosis, Diffusion Magnetic Resonance Imaging, Disease, English
	Abstract, Epitopes, Expert Systems, Factual, Female, Fine-Needle,
	Fusion, Fuzzy Logic, Gene Expression Profiling, Gene Expression Regulation,
	Gene Targeting, Genetic, Genome, Histocompatibility Antigens Class
	I, Humans, Hydrogen Bonding, Hydrophobicity, Image Interpretation,
	Image Processing, In Vitro, Indicators and Reagents, Information
	Storage and Retrieval, Ion Exchange, Least-Squares Analysis, Leiomyosarcoma,
	Liver Cirrhosis, Lung Neoplasms, Magnetic Resonance Imaging, Male,
	Mass, Mathematical Computing, Matrix-Assisted Laser Desorption-Ionization,
	Models, Molecular, Molecular Sequence Data, Neoplasm Proteins, Neoplasms,
	Neoplastic, Nephroblastoma, Neural Networks (Computer), Non-P.H.S.,
	Non-U.S. Gov't, Nonlinear Dynamics, Nucleic Acid Conformation, Nucleic
	Acid Hybridization, Oligonucleotide Array Sequence Analysis, Oncogene
	Proteins, Ovarian Neoplasms, P.H.S., Pattern Recognition, Predictive
	Value of Tests, Proteome, Prostatic Neoplasms, Protein, Protein Binding,
	Protein Interaction Mapping, Protein Structure, Proteins, Quantitative
	Structure-Activity Relationship, RNA, ROC Curve, Reproducibility of
	Results, Research Support, Rhabdomyosarcoma, Secondary, Sensitivity
	and Specificity, Sequence Alignment, Sequence Analysis, Severity of
	Illness Index, Software, Solubility, Spectrometry, Statistical,
	Structure-Activity Relationship, Subcellular Fractions, Subtraction
	Technique, T-Lymphocyte, Tissue Distribution, Transcription Factors,
	Transfer, Treatment Outcome, Tumor, Tumor Markers, U.S. Gov't,
	User-Computer Interface},
  pmid = {12961824}
}
@article{Dieterle2003Urinary,
  author = {Frank Dieterle and Silvia M{\"u}ller-Hagedorn and Hartmut M Liebich
	and G{\"u}nter Gauglitz},
  title = {Urinary nucleosides as potential tumor markers evaluated by learning
	vector quantization.},
  journal = {Artif. {I}ntell. {M}ed.},
  year = {2003},
  volume = {28},
  pages = {265--279},
  number = {3},
  month = jul,
  abstract = {Modified nucleosides were recently presented as potential tumor markers
	for breast cancer. {T}he patterns of the levels of urinary nucleosides
	are different for tumor bearing individuals and for healthy individuals.
	{T}hus, a powerful pattern recognition method is needed. {A}lthough
	backpropagation ({BP}) neural networks are becoming increasingly
	common in medical literature for pattern recognition, it has been
	shown that often-superior methods exist like learning vector quantization
	({LVQ}) and support vector machines ({SVM}). {T}he aim of this feasibility
	study is to get an indication of the performance of urinary nucleoside
	levels evaluated by {LVQ} in contrast to the evaluation the popular
	{BP} and {SVM} networks. {U}rine samples were collected from female
	breast cancer patients and from healthy females. {T}welve different
	ribonucleosides were isolated and quantified by a high performance
	liquid chromatography ({HPLC}) procedure. {LVQ}, {SVM} and {BP} networks
	were trained and the performance was evaluated by the classification
	of the test sets into the categories "cancer" and "healthy". {A}ll
	methods showed a good classification with a sensitivity ranging from
	58.8 to 70.6\% at a specificity of 88.4-94.2\% for the test patterns.
	{A}lthough the classification performance of all methods is comparable,
	the {LVQ} implementations are superior in terms of more qualitative
	features: the results of {LVQ} networks are more reproducible, as
	the initialization is deterministic. {T}he {LVQ} networks can be
	trained by unbalanced sizes of the different classes. {LVQ} networks
	are fast during training, need only few parameters adjusted for training
	and can be retrained by patterns of "local individuals". {A}s at
	least some of these features play an important role in an implementation
	into a medical decision support system, it is recommended to use
	{LVQ} for an extended study.},
  doi = {10.1016/S0933-3657(03)00058-7},
  pdf = {../local/Dieterle2003Urinary.pdf},
  file = {Dieterle2003Urinary.pdf:local/Dieterle2003Urinary.pdf:PDF},
  keywords = {80 and over, Adnexal Diseases, Adult, Aged, Algorithms, Artificial
	Intelligence, Automated, Bayes Theorem, Biological, Breast Neoplasms,
	Case-Control Studies, Chromatography, Comparative Study, Computational
	Biology, Computer-Assisted, Diagnosis, Differential, Feasibility
	Studies, Female, High Pressure Liquid, Humans, Logistic Models, Middle
	Aged, Neural Networks (Computer), Non-U.S. Gov't, Nucleosides, Ovarian
	Neoplasms, Pattern Recognition, Predictive Value of Tests, ROC Curve,
	Reproducibility of Results, Research Support, Sensitivity and Specificity,
	Tumor Markers, 12927336},
  pii = {S0933365703000587},
  url = {http://dx.doi.org/10.1016/S0933-3657(03)00058-7}
}
@article{Ding2001Multi-class,
  author = {Ding, C. H. Q. and Dubchak, I.},
  title = {Multi-class protein fold recognition using support vector machines
	and neural networks},
  journal = {Bioinformatics},
  year = {2001},
  volume = {17},
  pages = {349--358},
  number = {4},
  abstract = {Motivation: {P}rotein fold recognition is an important approach to
	structure discovery without relying on sequence similarity. {W}e
	study this approach with new multi-class classification methods and
	examined many issues important for a practical recognition system.
	{R}esults: {M}ost current discriminative methods for protein fold
	prediction use the one-against-others method, which has the well-known
	``{F}alse {P}ositives'' problem. {W}e investigated two new methods:
	the unique one-against-others and the all-against-all methods. {B}oth
	improve prediction accuracy by 14--110\% on a dataset containing 27
	{SCOP} folds. {W}e used the {S}upport {V}ector {M}achine ({SVM})
	and the {N}eural {N}etwork ({NN}) learning methods as base classifiers.
	{SVM}s converges fast and leads to high accuracy. {W}hen scores of
	multiple parameter datasets are combined, majority voting reduces
	noise and increases recognition accuracy. {W}e examined many issues
	involved with large number of classes, including dependencies of
	prediction accuracy on the number of folds and on the number of representatives
	in a fold. {O}verall, recognition systems achieve 56\% fold prediction
	accuracy on a protein test dataset, where most of the proteins have
	below 25\% sequence identity with the proteins used in training. {S}upplementary
	information: {T}he protein parameter datasets used in this paper
	are available online (http://www.nersc.gov/~cding/protein).},
  pdf = {../local/Ding2001Multi-class.pdf},
  file = {Ding2001Multi-class.pdf:local/Ding2001Multi-class.pdf:PDF},
  keywords = {biosvm},
  subject = {biokernel},
  url = {http://bioinformatics.oupjournals.org/cgi/reprint/17/4/349.pdf}
}
@article{Ding2005Minimum,
  author = {Chris Ding and Hanchuan Peng},
  title = {Minimum redundancy feature selection from microarray gene expression
	data.},
  journal = {J {B}ioinform {C}omput {B}iol},
  year = {2005},
  volume = {3},
  pages = {185--205},
  number = {2},
  month = apr,
  abstract = {How to selecting a small subset out of the thousands of genes in microarray
	data is important for accurate classification of phenotypes. {W}idely
	used methods typically rank genes according to their differential
	expressions among phenotypes and pick the top-ranked genes. {W}e
	observe that feature sets so obtained have certain redundancy and
	study methods to minimize it. {W}e propose a minimum redundancy -
	maximum relevance ({MRMR}) feature selection framework. {G}enes selected
	via {MRMR} provide a more balanced coverage of the space and capture
	broader characteristics of phenotypes. {T}hey lead to significantly
	improved class predictions in extensive experiments on 6 gene expression
	data sets: {NCI}, {L}ymphoma, {L}ung, {C}hild {L}eukemia, {L}eukemia,
	and {C}olon. {I}mprovements are observed consistently among 4 classification
	methods: {N}aive {B}ayes, {L}inear discriminant analysis, {L}ogistic
	regression, and {S}upport vector machines. {SUPPLIMENTARY}: {T}he
	top 60 {MRMR} genes for each of the datasets are listed in http://crd.lbl.gov/~cding/{MRMR}/.
	{M}ore information related to {MRMR} methods can be found at http://www.hpeng.net/.},
  keywords = {Adult, Aged, Aging, Algorithms, Animals, Apoptosis, Artificial Intelligence,
	Automated, Biological, Bone Marrow, Breast Neoplasms, Classification,
	Cluster Analysis, Comparative Study, Computer Simulation, Computer-Assisted,
	Diagnosis, Dose-Response Relationship, Drug, Female, Foot, Gait,
	Gene Expression Profiling, Gene Expression Regulation, Gene Silencing,
	Genetic Vectors, Humans, Image Interpretation, Information Storage
	and Retrieval, Kidney, Liver, Logistic Models, Male, Messenger, Models,
	Myocardium, Neoplasms, Non-U.S. Gov't, Oligonucleotide Array Sequence
	Analysis, Pattern Recognition, Pharmaceutical Preparations, Polymerase
	Chain Reaction, Principal Component Analysis, Proteins, RNA, Rats,
	Reproducibility of Results, Research Support, Sensitivity and Specificity,
	Small Interfering, Sprague-Dawley, Statistical, Subcellular Fractions,
	Unknown Primary, 15852500},
  pii = {S0219720005001004}
}
@article{Dobson2005Predicting,
  author = {Dobson, P. D. and Doig, A. J.},
  title = {Predicting enzyme class from protein structure without alignments},
  journal = {J. {M}ol. {B}iol.},
  year = {2005},
  volume = {345},
  pages = {187--199},
  number = {1},
  month = jan,
  abstract = {Methods for predicting protein function from structure are becoming
	more important as the rate at which structures are solved increases
	more rapidly than experimental knowledge. {A}s a result, protein
	structures now frequently lack functional annotations. {T}he majority
	of methods for predicting protein function are reliant upon identifying
	a similar protein and transferring its annotations to the query protein.
	{T}his method fails when a similar protein cannot be identified,
	or when any similar proteins identified also lack reliable annotations.
	{H}ere, we describe a method that can assign function from structure
	without the use of algorithms reliant upon alignments. {U}sing simple
	attributes that can be calculated from any crystal structure, such
	as secondary structure content, amino acid propensities, surface
	properties and ligands, we describe each enzyme in a non-redundant
	set. {T}he set is split according to {E}nzyme {C}lassification ({EC})
	number. {W}e combine the predictions of one-class versus one-class
	support vector machine models to make overall assignments of {EC}
	number to an accuracy of 35\% with the top-ranked prediction, rising
	to 60\% accuracy with the top two ranks. {I}n doing so we demonstrate
	the utility of simple structural attributes in protein function prediction
	and shed light on the link between structure and function. {W}e apply
	our methods to predict the function of every currently unclassified
	protein in the {P}rotein {D}ata {B}ank.},
  doi = {10.1016/j.jmb.2004.10.024},
  pdf = {../local/Dobson2005Predicting.pdf},
  file = {Dobson2005Predicting.pdf:local/Dobson2005Predicting.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1016/j.jmb.2004.10.024}
}
@article{Dobson2003Distinguishing,
  author = {Dobson, P. D. and Doig, A. J.},
  title = {Distinguishing enzyme structures from non-enzymes without alignments},
  journal = {J. {M}ol. {B}iol.},
  year = {2003},
  volume = {330},
  pages = {771--783},
  number = {4},
  abstract = {The ability to predict protein function from structure is becoming
	increasingly important as the number of structures resolved is growing
	more rapidly than our capacity to study function. {C}urrent methods
	for predicting protein function are mostly reliant on identifying
	a similar protein of known function. {F}or proteins that are highly
	dissimilar or are only similar to proteins also lacking functional
	annotations, these methods fail. {H}ere, we show that protein function
	can be predicted as enzymatic or not without resorting to alignments.
	{W}e describe 1178 high-resolution proteins in a structurally non-redundant
	subset of the {P}rotein {D}ata {B}ank using simple features such
	as secondary-structure content, amino acid propensities, surface
	properties and ligands. {T}he subset is split into two functional
	groupings, enzymes and non-enzymes. {W}e use the support vector machine-learning
	algorithm to develop models that are capable of assigning the protein
	class. {V}alidation of the method shows that the function can be
	predicted to an accuracy of 77\% using 52 features to describe each
	protein. {A}n adaptive search of possible subsets of features produces
	a simplified model based on 36 features that predicts at an accuracy
	of 80\%. {W}e compare the method to sequence-based methods that also
	avoid calculating alignments and predict a recently released set
	of unrelated proteins. {T}he most useful features for distinguishing
	enzymes from non-enzymes are secondary-structure content, amino acid
	frequencies, number of disulphide bonds and size of the largest cleft.
	{T}his method is applicable to any structure as it does not require
	the identification of sequence or structural similarity to a protein
	of known function.},
  doi = {10.1016/S0022-2836(03)00628-4},
  pdf = {../local/Dobson2003Distinguishing.pdf},
  file = {Dobson2003Distinguishing.pdf:local/Dobson2003Distinguishing.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1016/S0022-2836(03)00628-4}
}
@article{Donaldson2003PreBIND,
  author = {Donaldson, I. and Martin, J. and de Bruijn, B. and Wolting, C. and
	Lay, V. and Tuekam, B. and Zhang, S. and Baskin, B. and Bader, G. D.
	and Michalickova, K. and Pawson, T. and Hogue, C. W. V.},
  title = {{{P}re{BIND}} and {T}extomy - mining the biomedical literature for
	protein-protein interactions using a support vector machine},
  journal = {B{MC} {B}ioinformatics},
  year = {2003},
  volume = {4},
  pages = {11},
  number = {1},
  month = mar,
  abstract = {Background {T}he majority of experimentally verified molecular interaction
	and biological pathway data are present in the unstructured text
	of biomedical journal articles where they are inaccessible to computational
	methods. {T}he {B}iomolecular interaction network database ({BIND})
	seeks to capture these data in a machine-readable format. {W}e hypothesized
	that the formidable task-size of backfilling the database could be
	reduced by using {S}upport {V}ector {M}achine technology to first
	locate interaction information in the literature. {W}e present an
	information extraction system that was designed to locate protein-protein
	interaction data in the literature and present these data to curators
	and the public for review and entry into {BIND}. {R}esults {C}ross-validation
	estimated the support vector machine's test-set precision, accuracy
	and recall for classifying abstracts describing interaction information
	was 92\%, 90\% and 92\% respectively. {W}e estimated that the system
	would be able to recall up to 60\% of all non-high throughput interactions
	present in another yeast-protein interaction database. {F}inally,
	this system was applied to a real-world curation problem and its
	use was found to reduce the task duration by 70\% thus saving 176
	days. {C}onclusions {M}achine learning methods are useful as tools
	to direct interaction and pathway database back-filling; however,
	this potential can only be realized if these techniques are coupled
	with human review and entry into a factual database such as {BIND}.
	{T}he {P}re{BIND} system described here is available to the public
	at http://bind.ca. {C}urrent capabilities allow searching for human,
	mouse and yeast protein-interaction information.},
  doi = {10.1186/1471-2105-4-11},
  pdf = {../local/Donaldson2003PreBIND.pdf},
  file = {Donaldson2003PreBIND.pdf:local/Donaldson2003PreBIND.pdf:PDF},
  keywords = {biosvm},
  owner = {vert},
  url = {http://www.biomedcentral.com/1471-2105/4/11/abstract}
}
@article{Dong2005Fast,
  author = {Jian-xiong Dong and Adam Krzyzak and Ching Y Suen},
  title = {Fast {SVM} training algorithm with decomposition on very large data
	sets.},
  journal = {I{EEE} {T}rans {P}attern {A}nal {M}ach {I}ntell},
  year = {2005},
  volume = {27},
  pages = {603--618},
  number = {4},
  month = apr,
  abstract = {Training a support vector machine on a data set of huge size with
	thousands of classes is a challenging problem. {T}his paper proposes
	an efficient algorithm to solve this problem. {T}he key idea is to
	introduce a parallel optimization step to quickly remove most of
	the nonsupport vectors, where block diagonal matrices are used to
	approximate the original kernel matrix so that the original problem
	can be split into hundreds of subproblems which can be solved more
	efficiently. {I}n addition, some effective strategies such as kernel
	caching and efficient computation of kernel matrix are integrated
	to speed up the training process. {O}ur analysis of the proposed
	algorithm shows that its time complexity grows linearly with the
	number of classes and size of the data set. {I}n the experiments,
	many appealing properties of the proposed algorithm have been investigated
	and the results show that the proposed algorithm has a much better
	scaling capability than {L}ibsvm, {SVM}light, and {SVMT}orch. {M}oreover,
	the good generalization performances on several large databases have
	also been achieved.},
  keywords = {Algorithms, Animals, Antibiotics, Antineoplastic, Artificial Intelligence,
	Automated, Automatic Data Processing, Butadienes, Chloroplasts, Comparative
	Study, Computer Simulation, Computer-Assisted, Database Management
	Systems, Databases, Diagnosis, Disinfectants, Dose-Response Relationship,
	Drug, Drug Toxicity, Electrodes, Electroencephalography, Ethylamines,
	Expert Systems, Factual, Feedback, Fungicides, Gene Expression Profiling,
	Genes, Genetic Markers, Humans, Image Enhancement, Image Interpretation,
	Implanted, Industrial, Information Storage and Retrieval, Kidney,
	Kidney Tubules, MEDLINE, Male, Mercuric Chloride, Microarray Analysis,
	Molecular Biology, Motor Cortex, Movement, Natural Language Processing,
	Neural Networks (Computer), Non-P.H.S., Non-U.S. Gov't, Numerical
	Analysis, Pattern Recognition, Plant Proteins, Predictive Value of
	Tests, Proteins, Proteome, Proximal, Puromycin Aminonucleoside, Rats,
	Reproducibility of Results, Research Support, Sensitivity and Specificity,
	Signal Processing, Sprague-Dawley, Subcellular Fractions, Terminology,
	Therapy, Time Factors, Toxicogenetics, U.S. Gov't, User-Computer
	Interface, 15794164}
}
@article{Doniger2002Predicting,
  author = {Doniger, S. and Hofmann, T. and Yeh, J.},
  title = {Predicting {CNS} permeability of drug molecules: comparison of neural
	network and support vector machine algorithms},
  journal = {J. {C}omput. {B}iol.},
  year = {2002},
  volume = {9},
  pages = {849--864},
  number = {6},
  abstract = {Two different machine-learning algorithms have been used to predict
	the blood-brain barrier permeability of different classes of molecules,
	to develop a method to predict the ability of drug compounds to penetrate
	the {CNS}. {T}he first algorithm is based on a multilayer perceptron
	neural network and the second algorithm uses a support vector machine.
	{B}oth algorithms are trained on an identical data set consisting
	of 179 {CNS} active molecules and 145 {CNS} inactive molecules. {T}he
	training parameters include molecular weight, lipophilicity, hydrogen
	bonding, and other variables that govern the ability of a molecule
	to diffuse through a membrane. {T}he results show that the support
	vector machine outperforms the neural network. {B}ased on over 30
	different validation sets, the {SVM} can predict up to 96\% of the
	molecules correctly, averaging 81.5\% over 30 test sets, which comprised
	of equal numbers of {CNS} positive and negative molecules. {T}his
	is quite favorable when compared with the neural network's average
	performance of 75.7\% with the same 30 test sets. {T}he results of
	the {SVM} algorithm are very encouraging and suggest that a classification
	tool like this one will prove to be a valuable prediction approach.},
  doi = {10.1089/10665270260518317},
  pdf = {../local/Doniger2002Predicting.pdf},
  file = {Doniger2002Predicting.pdf:local/Doniger2002Predicting.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert}
}
@article{Dostie2007Chromosome,
  author = {Jos{\'e}e Dostie and Ye Zhan and Job Dekker},
  title = {Chromosome conformation capture carbon copy technology.},
  journal = {Curr Protoc Mol Biol},
  year = {2007},
  volume = {Chapter 21},
  pages = {Unit 21.14},
  month = oct,
  abstract = {Chromosome conformation capture (3C) is used to quantify physical
	DNA contacts in vivo at high resolution. 3C was first used in yeast
	to map the spatial chromatin organization of chromosome III, and
	in higher eukaryotes to demonstrate that genomic DNA elements regulate
	target genes by physically interacting with them. 3C has been widely
	adopted for small-scale analysis of functional chromatin interactions
	along (cis) or between (trans) chromosomes. For larger-scale applications,
	chromosome conformation capture carbon copy (5C) combines 3C with
	ligation-mediated amplification (LMA) to simultaneously quantify
	hundreds of thousands of physical DNA contacts by microarray or ultra-high-throughput
	DNA sequencing. 5C allows the mapping of extensive networks of physical
	interactions among large sets of genomic elements throughout the
	genome. Such networks can provide important biological insights,
	e.g., by identifying relationships between regulatory elements and
	their target genes. This unit describes 5C for large-scale analysis
	of cis- and trans-chromatin interactions in mammalian cells.},
  doi = {10.1002/0471142727.mb2114s80},
  institution = {University of Massachusetts Medical School, Worcester, Massachusetts,
	USA.},
  keywords = {Chromosomes, Artificial, Bacterial; Chromosomes, chemistry; DNA Primers,
	metabolism; Molecular Biology, methods; Nucleic Acid Conformation;
	Oligonucleotide Array Sequence Analysis; Polymerase Chain Reaction;
	Sequence Analysis, DNA; Templates, Genetic},
  language = {eng},
  medline-pst = {ppublish},
  owner = {philippe},
  pmid = {18265398},
  timestamp = {2010.08.11},
  url = {http://dx.doi.org/10.1002/0471142727.mb2114s80}
}
@article{Dover2002Methylation,
  author = {Jim Dover and Jessica Schneider and Mary Anne Tawiah-Boateng and
	Adam Wood and Kimberly Dean and Mark Johnston and Ali Shilatifard},
  title = {Methylation of histone H3 by COMPASS requires ubiquitination of histone
	H2B by Rad6.},
  journal = {J Biol Chem},
  year = {2002},
  volume = {277},
  pages = {28368--28371},
  number = {32},
  month = aug,
  abstract = {The DNA of eukaryotes is wrapped around nucleosomes and packaged into
	chromatin. Covalent modifications of the histone proteins that comprise
	the nucleosome alter chromatin structure and have major effects on
	gene expression. Methylation of lysine 4 of histone H3 by COMPASS
	is required for silencing of genes located near chromosome telomeres
	and within the rDNA (Krogan, N. J, Dover, J., Khorrami, S., Greenblatt,
	J. F., Schneider, J., Johnston, M., and Shilatifard, A. (2002) J.
	Biol. Chem. 277, 10753-10755; Briggs, S. D., Bryk, M., Strahl, B.
	D., Cheung, W. L., Davie, J. K., Dent, S. Y., Winston, F., and Allis,
	C. D. (2001) Genes. Dev. 15, 3286-3295). To learn about the mechanism
	of histone methylation, we surveyed the genome of the yeast Saccharomyces
	cerevisiae for genes necessary for this process. By analyzing approximately
	4800 mutant strains, each deleted for a different non-essential gene,
	we discovered that the ubiquitin-conjugating enzyme Rad6 is required
	for methylation of lysine 4 of histone H3. Ubiquitination of histone
	H2B on lysine 123 is the signal for the methylation of histone H3,
	which leads to silencing of genes located near telomeres.},
  doi = {10.1074/jbc.C200348200},
  institution = {Department of Biochemistry, Saint Louis University School of Medicine,
	St. Louis, Missouri 63104, USA.},
  keywords = {DNA, Ribosomal, metabolism; Electrophoresis, Polyacrylamide Gel; Gene
	Silencing; Histones, metabolism; Ligases, metabolism; Lysine, metabolism;
	Methylation; Models, Biological; Mutation; Saccharomyces cerevisiae
	Proteins; Saccharomyces cerevisiae, genetics; Ubiquitin, metabolism;
	Ubiquitin-Conjugating Enzymes},
  language = {eng},
  medline-pst = {ppublish},
  owner = {jp},
  pii = {C200348200},
  pmid = {12070136},
  timestamp = {2010.11.23},
  url = {http://dx.doi.org/10.1074/jbc.C200348200}
}
@article{Doyle2005PlosBiol,
  author = {John Doyle and Marie Csete},
  title = {Motifs, control, and stability.},
  journal = {PLoS Biol},
  year = {2005},
  volume = {3},
  pages = {e392},
  number = {11},
  month = nov,
  doi = {10.1371/journal.pbio.0030392},
  institution = {Department of Control and Dynamical Systems, California Institute
	of Technology, Pasadena, California, United States of America. doyle@caltech.edu},
  keywords = {Amino Acid Motifs; Bacterial Physiological Phenomena; Bacterial Proteins,
	chemistry; Escherichia coli, metabolism; Genes, Bacterial; Genes,
	Plant; Glycolysis; Heat-Shock Proteins, chemistry; Models, Biological;
	Models, Theoretical; Molecular Chaperones, chemistry; Plant Proteins,
	chemistry; Protein Interaction Mapping; Protein Structure, Tertiary;
	Transcription Factors, chemistry; Transcription, Genetic},
  language = {eng},
  medline-pst = {ppublish},
  owner = {Andrei Zinovyev},
  pii = {05-PLBI-P-0948},
  pmid = {16277557},
  timestamp = {2011.04.08},
  url = {http://dx.doi.org/10.1371/journal.pbio.0030392}
}
@article{Doytchinova2004Identifying,
  author = {Irini A Doytchinova and Pingping Guan and Darren R Flower},
  title = {Identifying human {MHC} supertypes using bioinformatic methods.},
  journal = {J. Immunol.},
  year = {2004},
  volume = {172},
  pages = {4314--4323},
  number = {7},
  month = apr,
  abstract = {Classification of MHC molecules into supertypes in terms of peptide-binding
	specificities is an important issue, with direct implications for
	the development of epitope-based vaccines with wide population coverage.
	In view of extremely high MHC polymorphism (948 class I and 633 class
	II HLA alleles) the experimental solution of this task is presently
	impossible. In this study, we describe a bioinformatics strategy
	for classifying MHC molecules into supertypes using information drawn
	solely from three-dimensional protein structure. Two chemometric
	techniques-hierarchical clustering and principal component analysis-were
	used independently on a set of 783 HLA class I molecules to identify
	supertypes based on structural similarities and molecular interaction
	fields calculated for the peptide binding site. Eight supertypes
	were defined: A2, A3, A24, B7, B27, B44, C1, and C4. The two techniques
	gave 77\% consensus, i.e., 605 HLA class I alleles were classified
	in the same supertype by both methods. The proposed strategy allowed
	"supertype fingerprints" to be identified. Thus, the A2 supertype
	fingerprint is Tyr(9)/Phe(9), Arg(97), and His(114) or Tyr(116);
	the A3-Tyr(9)/Phe(9)/Ser(9), Ile(97)/Met(97) and Glu(114) or Asp(116);
	the A24-Ser(9) and Met(97); the B7-Asn(63) and Leu(81); the B27-Glu(63)
	and Leu(81); for B44-Ala(81); the C1-Ser(77); and the C4-Asn(77).},
  keywords = {Alleles; Amino Acid Motifs; Binding Sites; Computational Biology;
	DNA Fingerprinting; HLA Antigens; HLA-A Antigens; HLA-B Antigens;
	HLA-C Antigens; Histocompatibility Antigens Class I; Histocompatibility
	Testing; Humans; Multigene Family; Protein Interaction Mapping},
  owner = {laurent},
  pmid = {15034046},
  timestamp = {2007.01.03}
}
@article{Dreiseitl2001comparison,
  author = {S. Dreiseitl and L. Ohno-Machado and H. Kittler and S. Vinterbo and
	H. Billhardt and M. Binder},
  title = {A comparison of machine learning methods for the diagnosis of pigmented
	skin lesions.},
  journal = {J {B}iomed {I}nform},
  year = {2001},
  volume = {34},
  pages = {28--36},
  number = {1},
  month = feb,
  abstract = {We analyze the discriminatory power of k-nearest neighbors, logistic
	regression, artificial neural networks ({ANN}s), decision trees,
	and support vector machines ({SVM}s) on the task of classifying pigmented
	skin lesions as common nevi, dysplastic nevi, or melanoma. {T}hree
	different classification tasks were used as benchmarks: the dichotomous
	problem of distinguishing common nevi from dysplastic nevi and melanoma,
	the dichotomous problem of distinguishing melanoma from common and
	dysplastic nevi, and the trichotomous problem of correctly distinguishing
	all three classes. {U}sing {ROC} analysis to measure the discriminatory
	power of the methods shows that excellent results for specific classification
	problems in the domain of pigmented skin lesions can be achieved
	with machine-learning methods. {O}n both dichotomous and trichotomous
	tasks, logistic regression, {ANN}s, and {SVM}s performed on about
	the same level, with k-nearest neighbors and decision trees performing
	worse.},
  doi = {10.1006/jbin.2001.1004},
  pdf = {../local/Dreiseitl2001comparison.pdf},
  file = {Dreiseitl2001comparison.pdf:local/Dreiseitl2001comparison.pdf:PDF},
  keywords = {Algorithms, Amino Acid Sequence, Artificial Intelligence, Biological,
	Cell Compartmentation, Comparative Study, Computer Simulation, Computer-Assisted,
	Decision Trees, Diagnosis, Discriminant Analysis, Humans, Logistic
	Models, Melanoma, Models, Neural Networks (Computer), Nevus, Non-U.S.
	Gov't, Organelles, P.H.S., Pigmented, Predictive Value of Tests,
	Proteins, Reproducibility of Results, Research Support, Skin Diseases,
	Skin Neoplasms, Skin Pigmentation, U.S. Gov't, 11376540},
  url = {http://dx.doi.org/10.1006/jbin.2001.1004}
}
@article{Driel2003new,
  author = {van Driel, M. and Cuelenaere, K. and Kemmeren, P. P. C. W. and Leunissen,
	J. A. M. and Brunner, H. G.},
  title = {A new web-based data mining tool for the identification of candidate
	genes for human genetic disorders.},
  journal = {Eur. J. Hum. Genet.},
  year = {2003},
  volume = {11},
  pages = {57--63},
  number = {1},
  month = jan,
  abstract = {To identify the gene underlying a human genetic disorder can be difficult
	and time-consuming. Typically, positional data delimit a chromosomal
	region that contains between 20 and 200 genes. The choice then lies
	between sequencing large numbers of genes, or setting priorities
	by combining positional data with available expression and phenotype
	data, contained in different internet databases. This process of
	examining positional candidates for possible functional clues may
	be performed in many different ways, depending on the investigator's
	knowledge and experience. Here, we report on a new tool called the
	GeneSeeker, which gathers and combines positional data and expression/phenotypic
	data in an automated way from nine different web-based databases.
	This results in a quick overview of interesting candidate genes in
	the region of interest. The GeneSeeker system is built in a modular
	fashion allowing for easy addition or removal of databases if required.
	Databases are searched directly through the web, which obviates the
	need for data warehousing. In order to evaluate the GeneSeeker tool,
	we analysed syndromes with known genesis. For each of 10 syndromes
	the GeneSeeker programme generated a shortlist that contained a significantly
	reduced number of candidate genes from the critical region, yet still
	contained the causative gene. On average, a list of 163 genes based
	on position alone was reduced to a more manageable list of 22 genes
	based on position and expression or phenotype information. We are
	currently expanding the tool by adding other databases. The GeneSeeker
	is available via the web-interface (http://www.cmbi.kun.nl/GeneSeeker/).},
  doi = {10.1038/sj.ejhg.5200918},
  institution = {Centre for Molecular and Biomolecular Informatics, University of
	Nijmegen, The Netherlands. M.vanDriel@cmbi.kun.nl},
  keywords = {Computational Biology; Databases, Genetic; Databases, Nucleic Acid;
	Gene Expression; Genetic Diseases, Inborn; Humans; Internet; Noonan
	Syndrome; Software},
  owner = {mordelet},
  pii = {5200918},
  pmid = {12529706},
  timestamp = {2010.09.27},
  url = {http://dx.doi.org/10.1038/sj.ejhg.5200918}
}
@article{Dror2005Accurate,
  author = {Dror, G. and Sorek, R. and Shamir, R.},
  title = {Accurate identification of alternatively spliced exons using support
	vector machine},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21},
  pages = {897--901},
  number = {7},
  month = apr,
  abstract = {Motivation: {A}lternative splicing is a major component of the regulation
	acting on mammalian transcriptomes. {I}t is estimated that over half
	of all human genes have more than one splice variant. {P}revious
	studies have shown that alternatively spliced exons possess several
	features that distinguish them from constitutively spliced ones.
	{R}ecently, we have demonstrated that such features can be used to
	distinguish alternative from constitutive exons. {I}n the current
	study we use advanced machine learning methods to generate robust
	alternative exons classifier. {R}esults: {W}e extracted several hundred
	local sequence features of constitutive as well as alternative exons.
	{U}sing feature selection methods we find seven attributes that are
	dominant for the task of classification. {S}everal less informative
	features help to slightly increase the performance of the classifier.
	{T}he classifier achieves a true positive rate of 50\% for a false
	positive rate of 0.5\%. {T}his result enables one to reliably identify
	alternatively spliced exons in exon databases that are believed to
	be dominated by constitutive exons. {A}vailability: {U}pon request
	from the authors.},
  doi = {10.1093/bioinformatics/bti132},
  pdf = {../local/Dror2005Accurate.pdf},
  file = {Dror2005Accurate.pdf:local/Dror2005Accurate.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/bti132v1}
}
@article{Dubey2005Support,
  author = {Anshul Dubey and Matthew J Realff and Jay H Lee and Andreas S Bommarius},
  title = {Support vector machines for learning to identify the critical positions
	of a protein},
  journal = {J {T}heor {B}iol},
  year = {2005},
  volume = {234},
  pages = {351--361},
  number = {3},
  month = jun,
  abstract = {A method for identifying the positions in the amino acid sequence,
	which are critical for the catalytic activity of a protein using
	support vector machines ({SVM}s) is introduced and analysed. {SVM}s
	are supported by an efficient learning algorithm and can utilize
	some prior knowledge about the structure of the problem. {T}he amino
	acid sequences of the variants of a protein, created by inducing
	mutations, along with their fitness are required as input data by
	the method to predict its critical positions. {T}o investigate the
	performance of this algorithm, variants of the beta-lactamase enzyme
	were created in silico using simulations of both mutagenesis and
	recombination protocols. {R}esults from literature on beta-lactamase
	were used to test the accuracy of this method. {I}t was also compared
	with the results from a simple search algorithm. {T}he algorithm
	was also shown to be able to predict critical positions that can
	tolerate two different amino acids and retain function.},
  doi = {10.1016/j.jtbi.2004.11.037},
  pdf = {../local/Dubey2005Support.pdf},
  file = {Dubey2005Support.pdf:local/Dubey2005Support.pdf:PDF},
  keywords = {biosvm},
  pii = {S0022-5193(04)00585-5},
  url = {http://dx.doi.org/10.1016/j.jtbi.2004.11.037}
}
@article{Donnes2002Prediction,
  author = {D{\"o}nnes, P. and Elofsson, A.},
  title = {Prediction of {MHC} class {I} binding peptides, using {SVMHC}},
  journal = {{BMC} {B}ioinformatics},
  year = {2002},
  volume = {3},
  pages = {25},
  number = {1},
  month = sep,
  abstract = {Background {T}-cells are key players in regulating a specific immune
	response. {A}ctivation of cytotoxic {T}-cells requires recognition
	of specific peptides bound to {M}ajor {H}istocompatibility {C}omplex
	({MHC}) class {I} molecules. {MHC}-peptide complexes are potential
	tools for diagnosis and treatment of pathogens and cancer, as well
	as for the development of peptide vaccines. {O}nly one in 100 to
	200 potential binders actually binds to a certain {MHC} molecule,
	therefore a good prediction method for {MHC} class {I} binding peptides
	can reduce the number of candidate binders that need to be synthesized
	and tested. {R}esults {H}ere, we present a novel approach, {SVMHC},
	based on support vector machines to predict the binding of peptides
	to {MHC} class {I} molecules. {T}his method seems to perform slightly
	better than two profile based methods, {SYFPEITHI} and {HLA}\_{BIND}.
	{T}he implementation of {SVMHC} is quite simple and does not involve
	any manual steps, therefore as more data become available it is trivial
	to provide prediction for more {MHC} types. {SVMHC} currently contains
	prediction for 26 {MHC} class {I} types from the {MHCPEP} database
	or alternatively 6 {MHC} class {I} types from the higher quality
	{SYFPEITHI} database. {T}he prediction models for these {MHC} types
	are implemented in a public web service available at http://www.sbc.su.se/svmhc/.
	{C}onclusions {P}rediction of {MHC} class {I} binding peptides using
	{S}upport {V}ector {M}achines, shows high performance and is easy
	to apply to a large number of {MHC} class {I} types. {A}s more peptide
	data are put into {MHC} databases, {SVMHC} can easily be updated
	to give prediction for additional {MHC} class {I} types. {W}e suggest
	that the number of binding peptides needed for {SVM} training is
	at least 20 sequences.},
  doi = {10.1186/1471-2105-3-25},
  pdf = {../local/Donnes2002Prediction.pdf},
  file = {Donnes2002Prediction.pdf:local/Donnes2002Prediction.pdf:PDF},
  keywords = {biosvm immunoinformatics},
  owner = {vert},
  url = {http://www.biomedcentral.com/1471-2105/3/25/abstract}
}
@article{Donnes2005Integrated,
  author = {D{\"o}nnes, P. and Kohlbacher, O.},
  title = {Integrated modeling of the major events in the {MHC} class {I} antigen
	processing pathway},
  journal = {Protein {S}ci.},
  year = {2005},
  volume = {14},
  pages = {2132--2140},
  month = jun,
  abstract = {Rational design of epitope-driven vaccines is a key goal of immunoinformatics.
	{T}ypically, candidate selection relies on the prediction of {MHC}-peptide
	binding only, as this is known to be the most selective step in the
	{MHC} class {I} antigen processing pathway. {H}owever, proteasomal
	cleavage and transport by the transporter associated with antigen
	processing ({TAP}) are essential steps in antigen processing as well.
	{W}hile prediction methods exist for the individual steps, no method
	has yet offered an integrated prediction of all three major processing
	events. {H}ere we present {WAPP}, a method combining prediction of
	proteasomal cleavage, {TAP} transport, and {MHC} binding into a single
	prediction system. {T}he proteasomal cleavage site prediction employs
	a new matrix-based method that is based on experimentally verified
	proteasomal cleavage sites. {S}upport vector regression is used for
	predicting peptides transported by {TAP}. {MHC} binding is the last
	step in the antigen processing pathway and was predicted using a
	support vector machine method, {SVMHC}. {T}he individual methods
	are combined in a filtering approach mimicking the natural processing
	pathway. {WAPP} thus predicts peptides that are cleaved by the proteasome
	at the {C} terminus, transported by {TAP}, and show significant affinity
	to {MHC} class {I} molecules. {T}his results in a decrease in false
	positive rates compared to {MHC} binding prediction alone. {C}ompared
	to prediction of {MHC} binding only, we report an increased overall
	accuracy and a lower rate of false positive predictions for the {HLA}-{A}*0201,
	{HLA}-{B}*2705, {HLA}-{A}*01, and {HLA}-{A}*03 alleles using {WAPP}.
	{T}he method is available online through our prediction server at
	http://www-bs.informatik.uni-tuebingen.de/{WAPP}.},
  doi = {10.1110/ps.051352405},
  pdf = {../local/Donnes2005Integrated.pdf},
  file = {Donnes2005Integrated.pdf:local/Donnes2005Integrated.pdf:PDF},
  keywords = {biosvm immunoinformatics},
  pii = {ps.051352405},
  url = {http://dx.doi.org/10.1110/ps.051352405}
}
@article{Ehlers2005NBS1,
  author = {Justis P Ehlers and J. William Harbour},
  title = {{NBS1} expression as a prognostic marker in uveal melanoma},
  journal = {Clin. {C}ancer {R}es.},
  year = {2005},
  volume = {11},
  pages = {1849--1853},
  number = {5},
  month = mar,
  abstract = {P{URPOSE}: {U}p to half of uveal melanoma patients die of metastatic
	disease. {T}reatment of the primary eye tumor does not improve survival
	in high-risk patients due to occult micrometastatic disease, which
	is present at the time of eye tumor diagnosis but is not detected
	and treated until months to years later. {H}ere, we use microarray
	gene expression data to identify a new prognostic marker. {EXPERIMENTAL}
	{DESIGN}: {M}icroarray gene expression profiles were analyzed in
	25 primary uveal melanomas. {T}umors were ranked by support vector
	machine ({SVM}) and by cytologic severity. {N}bs1 protein expression
	was assessed by quantitative immunohistochemistry in 49 primary uveal
	melanomas. {S}urvival was assessed using {K}aplan-{M}eier life-table
	analysis. {RESULTS}: {E}xpression of the {N}ijmegen breakage syndrome
	({NBS}1) gene correlated strongly with {SVM} and cytologic tumor
	rankings ({P} < 0.0001). {F}urther, immunohistochemistry expression
	of the {N}bs1 protein correlated strongly with both {SVM} and cytologic
	rankings ({P} < 0.0001). {T}he 6-year actuarial survival was 100\%
	in patients with low immunohistochemistry expression of {N}bs1 and
	22\% in those with high {N}bs1 expression ({P} = 0.01). {CONCLUSIONS}:
	{NBS}1 is a strong predictor of uveal melanoma survival and potentially
	could be used as a clinical marker for guiding clinical management.},
  doi = {10.1158/1078-0432.CCR-04-2054},
  pdf = {../local/Ehlers2005NBS1.pdf},
  file = {Ehlers2005NBS1.pdf:local/Ehlers2005NBS1.pdf:PDF},
  keywords = {80 and over, Adult, Aged, Algorithms, Amino Acid Sequence, Amino Acids,
	Analysis of Variance, Animals, Area Under Curve, Artifacts, Automated,
	Bacteriophage T4, Base Sequence, Biological, Birefringence, Brain
	Chemistry, Brain Neoplasms, Cell Cycle Proteins, Comparative Study,
	Computational Biology, Computer-Assisted, Cornea, Cross-Sectional
	Studies, Databases, Decision Trees, Diagnosis, Diagnostic Imaging,
	Diagnostic Techniques, Discriminant Analysis, Evolution, Extramural,
	Face, Female, Gene Expression Profiling, Genetic, Glaucoma, Humans,
	Immunohistochemistry, Intraocular Pressure, Lasers, Least-Squares
	Analysis, Likelihood Functions, Magnetic Resonance Imaging, Magnetic
	Resonance Spectroscopy, Male, Markov Chains, Melanoma, Middle Aged,
	Models, Molecular, Mutation, N.I.H., Nerve Fibers, Non-P.H.S., Non-U.S.
	Gov't, Nuclear Proteins, Nucleic Acid, Nucleic Acid Conformation,
	Numerical Analysis, Oligonucleotide Array Sequence Analysis, Ophthalmological,
	Optic Nerve Diseases, Optical Coherence, P.H.S., Pattern Recognition,
	Photic Stimulation, Polymorphism, Prognosis, Prospective Studies,
	Protein, Protein Structure, Proteins, RNA, ROC Curve, Regression
	Analysis, Reproducibility of Results, Research Support, Retinal Ganglion
	Cells, Secondary, Sensitivity and Specificity, Sequence Analysis,
	Single Nucleotide, Single-Stranded Conformational, Software, Statistics,
	Survival Analysis, Tertiary, Tomography, Tumor Markers, U.S. Gov't,
	Untranslated, Uveal Neoplasms, Visual Fields, beta-Lactamases},
  pii = {11/5/1849},
  pmid = {15756009},
  url = {http://clincancerres.aacrjournals.org/cgi/content/abstract/11/5/1849}
}
@article{Eid2009Real,
  author = {John Eid and Adrian Fehr and Jeremy Gray and Khai Luong and John
	Lyle and Geoff Otto and Paul Peluso and David Rank and Primo Baybayan
	and Brad Bettman and Arkadiusz Bibillo and Keith Bjornson and Bidhan
	Chaudhuri and Frederick Christians and Ronald Cicero and Sonya Clark
	and Ravindra Dalal and Alex Dewinter and John Dixon and Mathieu Foquet
	and Alfred Gaertner and Paul Hardenbol and Cheryl Heiner and Kevin
	Hester and David Holden and Gregory Kearns and Xiangxu Kong and Ronald
	Kuse and Yves Lacroix and Steven Lin and Paul Lundquist and Congcong
	Ma and Patrick Marks and Mark Maxham and Devon Murphy and Insil Park
	and Thang Pham and Michael Phillips and Joy Roy and Robert Sebra
	and Gene Shen and Jon Sorenson and Austin Tomaney and Kevin Travers
	and Mark Trulson and John Vieceli and Jeffrey Wegener and Dawn Wu
	and Alicia Yang and Denis Zaccarin and Peter Zhao and Frank Zhong
	and Jonas Korlach and Stephen Turner},
  title = {Real-time {DNA} sequencing from single polymerase molecules},
  journal = {Science},
  year = {2009},
  volume = {323},
  pages = {133--138},
  number = {5910},
  month = jan,
  abstract = {We present single-molecule, real-time sequencing data obtained from
	a DNA polymerase performing uninterrupted template-directed synthesis
	using four distinguishable fluorescently labeled deoxyribonucleoside
	triphosphates (dNTPs). We detected the temporal order of their enzymatic
	incorporation into a growing DNA strand with zero-mode waveguide
	nanostructure arrays, which provide optical observation volume confinement
	and enable parallel, simultaneous detection of thousands of single-molecule
	sequencing reactions. Conjugation of fluorophores to the terminal
	phosphate moiety of the dNTPs allows continuous observation of DNA
	synthesis over thousands of bases without steric hindrance. The data
	report directly on polymerase dynamics, revealing distinct polymerization
	states and pause sites corresponding to DNA secondary structure.
	Sequence data were aligned with the known reference sequence to assay
	biophysical parameters of polymerization for each template position.
	Consensus sequences were generated from the single-molecule reads
	at 15-fold coverage, showing a median accuracy of 99.3\%, with no
	systematic error beyond fluorophore-dependent error rates.},
  doi = {10.1126/science.1162986},
  institution = {Pacific Biosciences, 1505 Adams Drive, Menlo Park, CA 94025, USA.},
  keywords = {Base Sequence; Consensus Sequence; DNA, Circular, chemistry; DNA,
	Single-Stranded, chemistry; DNA, biosynthesis; DNA-Directed DNA Polymerase,
	metabolism; Deoxyribonucleotides, metabolism; Enzymes, Immobilized;
	Fluorescent Dyes; Kinetics; Nanostructures; Sequence Analysis, DNA,
	methods; Spectrometry, Fluorescence},
  language = {eng},
  medline-pst = {ppublish},
  owner = {philippe},
  pii = {1162986},
  pmid = {19023044},
  timestamp = {2010.07.28},
  url = {http://dx.doi.org/10.1126/science.1162986}
}
@article{Ekins2002Towards,
  author = {S. Ekins and B. Boulanger and P. W. Swaan and M. A. Z. Hupcey},
  title = {Towards a new age of virtual {ADME}/{TOX} and multidimensional
	drug discovery},
  journal = {J Comput Aided Mol Des},
  year = {2002},
  volume = {16},
  pages = {381--401},
  number = {5-6},
  abstract = {With the continual pressure to ensure follow-up molecules to billion
	dollar blockbuster drugs, there is a hurdle in profitability and
	growth for pharmaceutical companies in the next decades. With each
	success and failure we increasingly appreciate that a key to the
	success of synthesized molecules through the research and development
	process is the possession of drug-like properties. These properties
	include an adequate bioactivity as well as adequate solubility, an
	ability to cross critical membranes (intestinal and sometimes blood-brain
	barrier), reasonable metabolic stability and of course safety in
	humans. Dependent on the therapeutic area being investigated it might
	also be desirable to avoid certain enzymes or transporters to circumvent
	potential drug-drug interactions. It may also be important to limit
	the induction of these same proteins that can result in further toxicities.
	We have clearly moved the assessment of in vitro absorption, distribution,
	metabolism, excretion and toxicity (ADME/TOX) parameters much earlier
	in the discovery organization than a decade ago with the inclusion
	of higher throughput systems. We are also now faced with huge amounts
	of ADME/TOX data for each molecule that need interpretation and also
	provide a valuable resource for generating predictive computational
	models for future drug discovery. The present review aims to show
	what tools exist today for visualizing and modeling ADME/TOX data,
	what tools need to be developed, and how both the present and future
	tools are valuable for virtual filtering using ADME/TOX and bioactivity
	properties in parallel as a viable addition to present practices.},
  keywords = {ATP-Binding Cassette Transporters, Algorithms, Animals, Biological,
	Biological Availability, Computer Simulation, Drug Design, Drug Evaluation,
	Drug Industry, Gene Expression Profiling, Humans, Models, Organic
	Anion Transporters, P.H.S., Pharmaceutical, Pharmaceutical Preparations,
	Pharmacogenetics, Pharmacokinetics, Preclinical, Proteomics, Research
	Support, Software, Systems Biology, Technology, Toxicity Tests, U.S.
	Gov't},
  owner = {mahe},
  pmid = {12489686},
  timestamp = {2006.08.16}
}
@article{Engelhardt2005Protein,
  author = {Engelhardt, B. E. and Jordan, M. I. and Muratore, K. E. and Brenner,
	S. E.},
  title = {Protein Molecular Function Prediction by {Bayesian} Phylogenomics},
  journal = {{PLoS} {C}omput. {B}iol.},
  year = {2005},
  volume = {1},
  pages = {e45},
  number = {5},
  month = oct,
  abstract = {We present a statistical graphical model to infer specific molecular
	function for unannotated protein sequences using homology. {B}ased
	on phylogenomic principles, {SIFTER} ({S}tatistical {I}nference of
	{F}unction {T}hrough {E}volutionary {R}elationships) accurately predicts
	molecular function for members of a protein family given a reconciled
	phylogeny and available function annotations, even when the data
	are sparse or noisy. {O}ur method produced specific and consistent
	molecular function predictions across 100 {P}fam families in comparison
	to the {G}ene {O}ntology annotation database, {BLAST}, {GO}tcha,
	and {O}rthostrapper. {W}e performed a more detailed exploration of
	functional predictions on the adenosine-5'-monophosphate/adenosine
	deaminase family and the lactate/malate dehydrogenase family, in
	the former case comparing the predictions against a gold standard
	set of published functional characterizations. {G}iven function annotations
	for 3\% of the proteins in the deaminase family, {SIFTER} achieves
	96\% accuracy in predicting molecular function for experimentally
	characterized proteins as reported in the literature. {T}he accuracy
	of {SIFTER} on this dataset is a significant improvement over other
	currently available methods such as {BLAST} (75\%), {G}ene{Q}uiz
	(64\%), {GO}tcha (89\%), and {O}rthostrapper (11\%). {W}e also experimentally
	characterized the adenosine deaminase from {P}lasmodium falciparum,
	confirming {SIFTER}'s prediction. {T}he results illustrate the predictive
	power of exploiting a statistical model of function evolution in
	phylogenomic problems. {A} software implementation of {SIFTER} is
	available from the authors.},
  doi = {10.1371/journal.pcbi.0010045},
  pdf = {../local/Engelhardt2005Protein.pdf},
  file = {Engelhardt2005Protein.pdf:local/Engelhardt2005Protein.pdf:PDF},
  keywords = {biogm},
  owner = {vert},
  pmid = {16217548},
  timestamp = {2006.01.18},
  url = {http://dx.doi.org/10.1371/journal.pcbi.0010045}
}
@article{Felsenstein1981Evolutionary,
  author  = {Felsenstein, J.},
  title   = {Evolutionary trees from {DNA} sequences: a maximum likelihood approach},
  journal = {Journal of {M}olecular {E}volution},
  year    = {1981},
  volume  = {17},
  pages   = {368--376},
  subject = {bio}
}
@article{Fields1999Functional,
  author = {Fields, S. and Kohara, Y. and Lockhart, D. J.},
  title = {Functional genomics},
  journal = {Proc. {N}atl. {A}cad. {S}ci. {USA}},
  year = {1999},
  volume = {96},
  pages = {8825--8826},
  number = {16},
  month = aug,
  pdf = {../local/fiel99.pdf},
  file = {fiel99.pdf:local/fiel99.pdf:PDF},
  subject = {bio},
  url = {http://www.pnas.org/cgi/reprint/96/16/8825.pdf}
}
@article{Fong2004Predicting,
  author = {Fong, J. H. and Keating, A. E. and Singh, M.},
  title = {Predicting specificity in {bZIP} coiled-coil protein interactions},
  journal = {Genome {B}iol.},
  year = {2004},
  volume = {5},
  pages = {R11},
  number = {2},
  abstract = {We present a method for predicting protein-protein interactions mediated
	by the coiled-coil motif. {W}hen tested on interactions between nearly
	all human and yeast b{ZIP} proteins, our method identifies 70\% of
	strong interactions while maintaining that 92\% of predictions are
	correct. {F}urthermore, cross-validation testing shows that including
	the b{ZIP} experimental data significantly improves performance.
	{O}ur method can be used to predict b{ZIP} interactions in other
	genomes and is a promising approach for predicting coiled-coil interactions
	more generally.},
  pdf = {../local/Fong2004Predicting.pdf},
  file = {Fong2004Predicting.pdf:local/Fong2004Predicting.pdf:PDF},
  keywords = {biosvm},
  owner = {vert},
  url = {http://genomebiology.com/2004/5/2/R11}
}
@article{Friedel2005Support,
  author = {Friedel, C. C. and Jahn, K. H. V. and Sommer, S. and Rudd, S. and
	Mewes, H. W. and Tetko, I. V.},
  title = {Support vector machines for separation of mixed plant-pathogen {EST}
	collections based on codon usage},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21},
  pages = {1383--1388},
  abstract = {Motivation: {D}iscovery of host and pathogen genes expressed at the
	plant-pathogen interface often requires the construction of mixed
	libraries that contain sequences from both genomes. {S}equence identification
	requires high-throughput and reliable classification of genome origin.
	{W}hen using single-pass c{DNA} sequences difficulties arise from
	the short sequence length, the lack of sufficient taxonomically relevant
	sequence data in public databases and ambiguous sequence homology
	between plant and pathogen genes. {R}esults: {A} novel method is described,
	which is independent of the availability of homologous genes and
	relies on subtle differences in codon usage between plant and fungal
	genes. {W}e used support vector machines ({SVM}s) to identify the
	probable origin of sequences. {SVM}s were compared to several other
	machine learning techniques and to a probabilistic algorithm ({PF}-{IND},
	{M}aor et al., 2003) for {EST} classification also based on codon
	bias differences. {O}ur software ({ECLAT}) has achieved a classification
	accuracy of 93.1\% on a test set of 3217 {EST} sequences from {H}.
	vulgare and {B}. graminis, which is a significant improvement compared
	to {PF}-{IND} (prediction accuracy of 81.2\% on the same test set).
	{EST} sequences with at least 50 nt of coding sequence can be classified
	by {ECLAT} with high confidence. {ECLAT} allows training of classifiers
	for any host-pathogen combination for which there are sufficient
	classified training sequences. {A}vailability: {ECLAT} is freely available
	on the internet (http://mips.gsf.de/proj/est) or on request as a
	standalone version.},
  doi = {10.1093/bioinformatics/bti200},
  pdf = {../local/Friedel2005Support.pdf},
  file = {Friedel2005Support.pdf:local/Friedel2005Support.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/bti200v1}
}
@article{Friedman2000Using,
  author = {Friedman, N. and Linial, M. and Nachman, I. and Pe'er, D.},
  title = {Using {Bayesian} Networks to Analyze Expression Data},
  journal = {J. Comput. Biol.},
  year = {2000},
  volume = {7},
  pages = {601--620},
  number = {3-4},
  abstract = {{DNA} hybridization arrays simultaneously measure the expression level
	for thousands of genes. {T}hese measurements provide a ``snapshot''
	of transcription levels within the cell. {A} major challenge in computational
	biology is to uncover, from such measurements, gene/protein interactions
	and key biological features of cellular systems. {I}n this paper,
	we propose a new framework for discovering interactions between genes
	based on multiple expression measurements. {T}his framework builds
	on the use of {B}ayesian networks for representing statistical dependencies.
	{A} {B}ayesian network is a graph-based model of joint multivariate
	probability distributions that captures properties of conditional
	independence between variables. {S}uch models are attractive for
	their ability to describe complex stochastic processes and because
	they provide a clear methodology for learning from (noisy) observations.
	{W}e start by showing how {B}ayesian networks can describe interactions
	between genes. {W}e then describe a method for recovering gene interactions
	from microarray data using tools for learning {B}ayesian networks.
	{F}inally, we demonstrate this method on the {S}. cerevisiae cell-cycle
	measurements of {S}pellman et al. (1998).},
  doi = {10.1089/106652700750050961},
  pdf = {../local/Friedman2000Using.pdf},
  file = {Friedman2000Using.pdf:local/Friedman2000Using.pdf:PDF},
  keywords = {biogm},
  subject = {microarray},
  url = {http://dx.doi.org/10.1089/106652700750050961}
}
@article{Fritz2002Microarray-based,
  author = {Fritz, B. and Schubert, F. and Wrobel, G. and Schwaenen, C. and Wessendorf,
	S. and Nessling, M. and Korz, C. and Rieker, R. J. and Montgomery,
	K. and Kucherlapati, R. and Mechtersheimer, G. and Eils, R. and Joos,
	S. and Lichter, P.},
  title = {Microarray-based Copy Number and Expression Profiling in
	Dedifferentiated and Pleomorphic Liposarcoma},
  journal = {Cancer {R}es.},
  year = {2002},
  volume = {62},
  pages = {2993--2998},
  number = {11},
  abstract = {Sixteen dedifferentiated and pleomorphic liposarcomas were analyzed
	by comparative genomic hybridization ({CGH}) to genomic microarrays
	(matrix-{CGH}), c{DNA}-derived microarrays for expression profiling,
	and by quantitative {PCR}. {M}atrix-{CGH} revealed copy number gains
	of numerous oncogenes, i.e., {CCND}1, {MDM}2, {GLI}, {CDK}4, {MYB},
	{ESR}1, and {AIB}1, several of which correlate with a high level
	of transcripts from the respective gene. {I}n addition, a number
	of genes were found differentially expressed in dedifferentiated
	and pleomorphic liposarcomas. {A}pplication of dedicated clustering
	algorithms revealed that both tumor subtypes are clearly separated
	by the genomic profiles but only with a lesser power by the expression
	profiles. {U}sing a support vector machine, a subset of five clones
	was identified as ``class discriminators.'' {T}hus, for the distinction
	of these types of liposarcomas, genomic profiling appears to be more
	advantageous than {RNA} expression analysis.},
  pdf = {../local/Fritz2002Microarray-based.pdf},
  file = {Fritz2002Microarray-based.pdf:local/Fritz2002Microarray-based.pdf:PDF},
  keywords = {biosvm, cgh},
  owner = {jeanphilippevert},
  url = {http://cancerres.aacrjournals.org/cgi/content/abstract/62/11/2993}
}
@article{Fullwood2010Chromatin,
  author = {Melissa J Fullwood and Yuyuan Han and Chia-Lin Wei and Xiaoan Ruan
	and Yijun Ruan},
  title = {Chromatin interaction analysis using paired-end tag sequencing},
  journal = {Curr Protoc Mol Biol},
  year = {2010},
  volume = {Chapter 21},
  pages = {Unit 21.15.1--21.15.25},
  month = jan,
  abstract = {Chromatin Interaction Analysis using Paired-End Tag sequencing (ChIA-PET)
	is a technique developed for large-scale, de novo analysis of higher-order
	chromatin structures. Cells are treated with formaldehyde to cross-link
	chromatin interactions, DNA segments bound by protein factors are
	enriched by chromatin immunoprecipitation, and interacting DNA fragments
	are then captured by proximity ligation. The Paired-End Tag (PET)
	strategy is applied to the construction of ChIA-PET libraries, which
	are sequenced by high-throughput next-generation sequencing technologies.
	Finally, raw PET sequences are subjected to bioinformatics analysis,
	resulting in a genome-wide map of binding sites and chromatin interactions
	mediated by the protein factor under study. This unit describes ChIA-PET
	for genome-wide analysis of chromatin interactions in mammalian cells,
	with the application of Roche/454 and Illumina sequencing technologies.},
  doi = {10.1002/0471142727.mb2115s89},
  institution = {Genome Institute of Singapore, Agency for Science, Technology and
	Research, Singapore.},
  keywords = {Animals; Chromatin; Computational Biology; Databases, Nucleic Acid;
	Genome-Wide Association Study; Humans; Sequence Analysis, DNA},
  owner = {phupe},
  pmid = {20069536},
  timestamp = {2010.08.26},
  url = {http://dx.doi.org/10.1002/0471142727.mb2115s89}
}
@article{Furey2000Support,
  author = {Furey, T. S. and Cristianini, N. and Duffy, N. and Bednarski, D.
	W. and Schummer, M. and Haussler, D.},
  title = {Support vector machine classification and validation of cancer tissue
	samples using microarray expression data},
  journal = {Bioinformatics},
  year = {2000},
  volume = {16},
  pages = {906--914},
  number = {10},
  month = oct,
  abstract = {Motivation: {DNA} microarray experiments generating thousands of gene
	expression measurements, are being used to gather information from
	tissue and cell samples regarding gene expression differences that
	will be useful in diagnosing disease. {W}e have developed a new method
	to analyse this kind of data using support vector machines ({SVM}s).
	{T}his analysis consists of both classification of the tissue samples,
	and an exploration of the data for mis-labeled or questionable tissue
	results. {R}esults: {W}e demonstrate the method in detail on samples
	consisting of ovarian cancer tissues, normal ovarian tissues, and
	other normal tissues. {T}he dataset consists of expression experiment
	results for 97802 c{DNA}s for each tissue. {A}s a result of computational
	analysis, a tissue sample is discovered and confirmed to be wrongly
	labeled. {U}pon correction of this mistake and the removal of an
	outlier, perfect classification of tissues is achieved, but not with
	high confidence. {W}e identify and analyse a subset of genes from
	the ovarian dataset whose expression is highly differentiated between
	the types of tissues. {T}o show robustness of the {SVM} method, two
	previously published datasets from other types of tissues or cells
	are analysed. {T}he results are comparable to those previously obtained.
	{W}e show that other machine learning methods also perform comparably
	to the {SVM} on many of those datasets. {A}vailability: {T}he {SVM}
	software is available at http://www.cs.columbia.edu/\~{}bgrundy/svm.
	{C}ontact: booch@cse.ucsc.edu},
  pdf = {../local/Furey2000Support.pdf},
  file = {Furey2000Support.pdf:local/Furey2000Support.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/16/10/906}
}
@article{Furlanello2003Entropy-based,
  author = {Furlanello, C. and Serafini, M. and Merler, S. and Jurman, G.},
  title = {Entropy-based gene ranking without selection bias for the predictive
	classification of microarray data},
  journal = {{BMC} {B}ioinformatics},
  year = {2003},
  volume = {4},
  pages = {54},
  abstract = {Background {W}e describe the {E}-{RFE} method for gene ranking, which
	is useful for the identification of markers in the predictive classification
	of array data. {T}he method supports a practical modeling scheme
	designed to avoid the construction of classification rules based
	on the selection of too small gene subsets (an effect known as the
	selection bias, in which the estimated predictive errors are too
	optimistic due to testing on samples already considered in the feature
	selection process). {R}esults {W}ith {E}-{RFE}, we speed up the recursive
	feature elimination ({RFE}) with {SVM} classifiers by eliminating
	chunks of uninteresting genes using an entropy measure of the {SVM}
	weights distribution. {A}n optimal subset of genes is selected according
	to a two-strata model evaluation procedure: modeling is replicated
	by an external stratified-partition resampling scheme, and, within
	each run, an internal {K}-fold cross-validation is used for {E}-{RFE}
	ranking. {A}lso, the optimal number of genes can be estimated according
	to the saturation of {Z}ipf's law profiles. {C}onclusions {W}ithout
	a decrease of classification accuracy, {E}-{RFE} allows a speed-up
	factor of 100 with respect to standard {RFE}, while improving on
	alternative parametric {RFE} reduction strategies. {T}hus, a process
	for gene selection and error estimation is made practical, ensuring
	control of the selection bias, and providing additional diagnostic
	indicators of gene importance.},
  doi = {10.1186/1471-2105-4-54},
  pdf = {../local/Furlanello2003Entropy-based.pdf},
  file = {Furlanello2003Entropy-based.pdf:local/Furlanello2003Entropy-based.pdf:PDF},
  keywords = {biosvm},
  owner = {vert},
  url = {http://www.biomedcentral.com/1471-2105/4/54}
}
@article{Gangal2005Human,
  author = {Rajeev Gangal and Pankaj Sharma},
  title = {Human pol {II} promoter prediction: time series descriptors and machine
	learning.},
  journal = {Nucleic {A}cids {R}es},
  year = {2005},
  volume = {33},
  pages = {1332--1336},
  number = {4},
  abstract = {Although several in silico promoter prediction methods have been developed
	to date, they are still limited in predictive performance. {T}he
	limitations are due to the challenge of selecting appropriate features
	of promoters that distinguish them from non-promoters and the generalization
	or predictive ability of the machine-learning algorithms. {I}n this
	paper we attempt to define a novel approach by using unique descriptors
	and machine-learning methods for the recognition of eukaryotic polymerase
	{II} promoters. {I}n this study, non-linear time series descriptors
	along with non-linear machine-learning algorithms, such as support
	vector machine ({SVM}), are used to discriminate between promoter
	and non-promoter regions. {T}he basic idea here is to use descriptors
	that do not depend on the primary {DNA} sequence and provide a clear
	distinction between promoter and non-promoter regions. {T}he classification
	model built on a set of 1000 promoter and 1500 non-promoter sequences,
	showed a 10-fold cross-validation accuracy of 87\% and an independent
	test set had an accuracy >85\% in both promoter and non-promoter
	identification. {T}his approach correctly identified all 20 experimentally
	verified promoters of human chromosome 22. {T}he high sensitivity
	and selectivity indicates that n-mer frequencies along with non-linear
	time series descriptors, such as {L}yapunov component stability and
	{T}sallis entropy, and supervised machine-learning methods, such
	as {SVM}s, can be useful in the identification of pol {II} promoters.},
  doi = {10.1093/nar/gki271},
  pdf = {../local/Gangal2005Human.pdf},
  file = {Gangal2005Human.pdf:local/Gangal2005Human.pdf:PDF},
  keywords = {biosvm},
  pii = {33/4/1332},
  url = {http://dx.doi.org/10.1093/nar/gki271}
}
@article{Gardy2005PSORTb,
  author = {Gardy, J. L. and Laird, M. R. and Chen, F. and Rey, S. and Walsh,
	C. J. and Ester, M. and Brinkman, F. S. L.},
  title = {{{PSORT}b v.2.0}: expanded prediction of bacterial protein subcellular
	localization and insights gained from comparative proteome analysis},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21},
  pages = {617--623},
  number = {5},
  month = mar,
  abstract = {Motivation: {PSORT}b v.1.1 is the most precise bacterial localization
	prediction tool available. {H}owever the program's predictive coverage
	and recall are low and the method is only applicable to {G}ram-negative
	bacteria. {T}he goals of the present work were: increase {PSORT}b's
	coverage while maintaining the existing precision level, expand it
	to include {G}ram-positive bacteria, and then carry out a comparative
	analysis of localization.{R}esults: {A}n expanded database of proteins
	of known localization and new modules using frequent subsequence-based
	support vector machines were introduced into {PSORT}b v.2.0. {T}he
	program attains a precision of 96\% for {G}ram-positive and {G}ram-negative
	bacteria and predictive coverage comparable to other tools for whole
	proteome analysis. {W}e show that the proportion of proteins at each
	localization is remarkably consistent across species, even in species
	with varying proteome size.{A}vailability: {W}eb-based version: http://www.psort.org/psortb.
	{S}tandalone version: {A}vailable through the website under {GNU}
	{G}eneral {P}ublic {L}icense.{S}upplementary {I}nformation: http://www.psort.org/psortb/supplementaryinfo.html.},
  doi = {10.1093/bioinformatics/bti057},
  pdf = {../local/Gardy2005PSORTb.pdf},
  file = {Gardy2005PSORTb.pdf:local/Gardy2005PSORTb.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1093/bioinformatics/bti057}
}
@article{Garg2005SVM-based,
  author = {Garg, A. and Bhasin, M. and Raghava, G. P.},
  title = {{SVM}-based method for subcellular localization of human proteins
	using amino acid compositions, their order and similarity search},
  journal = {J. {B}iol. {C}hem.},
  year = {2005},
  volume = {280},
  pages = {14427--14432},
  number = {15},
  month = apr,
  abstract = {Here we report a systematic approach for predicting subcellular localization
	(cytoplasm, mitochondrial, nuclear and plasma membrane) of human
	proteins. {F}irstly, {SVM} based modules for predicting subcellular
	localization using traditional amino acid and dipeptide (i+1) composition
	achieved overall accuracy of 76.6\% and 77.8\%, respectively. {PSI}-{BLAST}
	when carried out using similarity-based search against non-redundant
	database of experimentally annotated proteins yielded 73.3\% accuracy.
	{T}o gain further insight, hybrid module (hybrid1) was developed
	based on amino acid composition, dipeptide composition, and similarity
	information and attained better accuracy of 84.9\%. {I}n addition,
	{SVM} module based on different higher order dipeptide i.e. i+2,
	i+3, and i+4 were also constructed for the prediction of subcellular
	localization of human proteins and overall accuracy of 79.7\%, 77.5\%
	and 77.1\% was accomplished respectively. {F}urthermore, another {SVM}
	module hybrid2 was developed using traditional dipeptide (i+1) and
	higher order dipeptide (i+2, i+3, and i+4) compositions, which gave
	an overall accuracy of 81.3\%. {W}e also developed {SVM} module hybrid3
	based on amino acid composition, traditional and higher order dipeptide
	compositions and {PSI}-{BLAST} output and achieved an overall accuracy
	of 84.4\%. {A} web server {HSLP}red (http://www.imtech.res.in/raghava/hslpred/
	or http://bioinformatics.uams.edu/raghava/hslpred/) has been designed
	to predict subcellular localization of human proteins using the above
	approaches.},
  doi = {10.1074/jbc.M411789200},
  pdf = {../local/Garg2005SVM-based.pdf},
  file = {Garg2005SVM-based.pdf:local/Garg2005SVM-based.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1074/jbc.M411789200}
}
@article{Garrett2003Comparison,
  author = {D. Garrett and D. A. Peterson and C. Anderson and M. Thaut},
  title = {Comparison of linear, nonlinear, and feature selection methods for
	{EEG} signal classification.},
  journal = {{IEEE} {T}rans {N}eural {S}yst {R}ehabil {E}ng},
  year = {2003},
  volume = {11},
  pages = {141--144},
  number = {2},
  month = jun,
  abstract = {The reliable operation of brain-computer interfaces ({BCI}s) based
	on spontaneous electroencephalogram ({EEG}) signals requires accurate
	classification of multichannel {EEG}. {T}he design of {EEG} representations
	and classifiers for {BCI} are open research questions whose difficulty
	stems from the need to extract complex spatial and temporal patterns
	from noisy multidimensional time series obtained from {EEG} measurements.
	{T}he high-dimensional and noisy nature of {EEG} may limit the advantage
	of nonlinear classification methods over linear ones. {T}his paper
	reports the results of a linear (linear discriminant analysis) and
	two nonlinear classifiers (neural networks and support vector machines)
	applied to the classification of spontaneous {EEG} during five mental
	tasks, showing that nonlinear classifiers produce only slightly better
	classification results. {A}n approach to feature selection based
	on genetic algorithms is also presented with preliminary results
	of application to {EEG} during finger movement.},
  keywords = {80 and over, Adnexal Diseases, Adult, Aged, Algorithms, Artificial
	Intelligence, Automated, Bayes Theorem, Biological, Brain, Brain
	Mapping, Breast Neoplasms, Case-Control Studies, Chromatography,
	Comparative Study, Computational Biology, Computer Simulation, Computer-Assisted,
	DNA, Diagnosis, Differential, Discriminant Analysis, Electroencephalography,
	Evoked Potentials, Feasibility Studies, Female, Fingers, Gene Expression
	Profiling, Gene Expression Regulation, Genetic, Genetic Markers,
	Genetic Predisposition to Disease, Genetic Screening, Habituation
	(Psychophysiology), High Pressure Liquid, Humans, Linear Models,
	Logistic Models, Male, Middle Aged, Migraine, Models, Movement, Neural
	Networks (Computer), Neurological, Non-P.H.S., Non-U.S. Gov't, Nonlinear
	Dynamics, Nucleosides, Ovarian Neoplasms, Pattern Recognition, Photic
	Stimulation, Predictive Value of Tests, ROC Curve, Reproducibility
	of Results, Research Support, Sensitivity and Specificity, Signal
	Processing, Software, Statistical, Thinking, Tumor Markers, U.S.
	Gov't, User-Computer Interface, Visual, 12899257}
}
@article{Gaudan2005Resolving,
  author = {Gaudan, S. and Kirsch, H. and Rebholz-Schuhmann, D.},
  title = {Resolving abbreviations to their senses in {Medline}.},
  journal = {Bioinformatics},
  year = {2005},
  month = jul,
  abstract = {M{OTIVATION}: {B}iological literature contains many abbreviations
	with one particular sense in each document. {H}owever, most abbreviations
	do not have a unique sense across the literature. {F}urthermore,
	many documents do not contain the long-forms of the abbreviations.
	{R}esolving an abbreviation in a document consists of retrieving
	its sense in use. {A}bbreviation resolution improves accuracy of
	document retrieval engines and of information extraction systems.
	{RESULTS}: {W}e combine an automatic analysis of {M}edline abstracts
	and linguistic methods to build a dictionary of abbreviation/sense
	pairs. {T}he dictionary is used for the resolution of abbreviations
	occurring with their long-forms. {A}mbiguous global abbreviations
	are resolved using {S}upport {V}ector {M}achines that have been trained
	on the context of each instance of the abbreviation/sense pairs,
	previously extracted for the dictionary setup. {T}he system disambiguates
	abbreviations with a precision of 98.9\% for a recall of 98.2\% (98.5\%
	accuracy). {T}his performance is superior in comparison to previously
	reported research work. {AVAILABILITY}: {T}he abbreviation resolution
	module is available at http://www.ebi.ac.uk/{R}ebholz/software.html.},
  doi = {10.1093/bioinformatics/bti586},
  pdf = {../local/Gaudan2005Resolving.pdf},
  file = {Gaudan2005Resolving.pdf:local/Gaudan2005Resolving.pdf:PDF},
  keywords = {biosvm nlp},
  pii = {bti586},
  url = {http://dx.doi.org/10.1093/bioinformatics/bti586}
}
@article{Ge2003Reducing,
  author = {Xijin Ge and Shuichi Tsutsumi and Hiroyuki Aburatani and Shuichi
	Iwata},
  title = {Reducing false positives in molecular pattern recognition.},
  journal = {Genome {I}nform {S}er {W}orkshop {G}enome {I}nform},
  year = {2003},
  volume = {14},
  pages = {34--43},
  abstract = {In the search for new cancer subtypes by gene expression profiling,
	it is essential to avoid misclassifying samples of unknown subtypes
	as known ones. {I}n this paper, we evaluated the false positive error
	rates of several classification algorithms through a 'null test'
	by presenting classifiers a large collection of independent samples
	that do not belong to any of the tumor types in the training dataset.
	{T}he benchmark dataset is available at www2.genome.rcast.u-tokyo.ac.jp/pm/.
	{W}e found that k-nearest neighbor ({KNN}) and support vector machine
	({SVM}) have very high false positive error rates when fewer genes
	(<100) are used in prediction. {T}he error rate can be partially
	reduced by including more genes. {O}n the other hand, prototype matching
	({PM}) method has a much lower false positive error rate. {S}uch
	robustness can be achieved without loss of sensitivity by introducing
	suitable measures of prediction confidence. {W}e also proposed a
	cluster-and-select technique to select genes for classification.
	{T}he nonparametric {K}ruskal-{W}allis {H} test is employed to select
	genes differentially expressed in multiple tumor types. {T}o reduce
	the redundancy, we then divided these genes into clusters with similar
	expression patterns and selected a given number of genes from each
	cluster. {T}he reliability of the new algorithm is tested on three
	public datasets.},
  keywords = {Amino Acid Sequence, Amino Acids, Animals, Automated, Base Sequence,
	Bayes Theorem, Biological, Carbohydrate Conformation, Carbohydrate
	Sequence, Cattle, Computational Biology, Computer Simulation, Crystallography,
	DNA, Databases, Factual, False Positive Reactions, Gene Expression
	Profiling, Genes, Genetic, Genetic Techniques, Genome, Histocompatibility
	Antigens Class I, Human, Humans, Introns, Least-Squares Analysis,
	MHC Class I, Major Histocompatibility Complex, Markov Chains, Messenger,
	Mice, Models, Monosaccharides, Neoplasms, Non-U.S. Gov't, Nonparametric,
	Pattern Recognition, Peptides, Phylogeny, Plants, Poly A, Polysaccharides,
	Predictive Value of Tests, Protein, Protein Structure, Proteins,
	RNA, Rats, Reproducibility of Results, Research Support, Saccharomyces
	cerevisiae, Secondary, Sequence Alignment, Software, Species Specificity,
	Statistics, Theoretical, X-Ray, 15706518}
}
@article{Gehlenborg2010Visualization,
  author = {Nils Gehlenborg and Se{\'a}n I O'Donoghue and Nitin S Baliga and Alexander
	Goesmann and Matthew A Hibbs and Hiroaki Kitano and Oliver Kohlbacher
	and Heiko Neuweger and Reinhard Schneider and Dan Tenenbaum and Anne-Claude
	Gavin},
  title = {Visualization of omics data for systems biology.},
  journal = {Nat Methods},
  year = {2010},
  volume = {7},
  pages = {S56--S68},
  number = {3 Suppl},
  month = mar,
  abstract = {High-throughput studies of biological systems are rapidly accumulating
	a wealth of 'omics'-scale data. Visualization is a key aspect of
	both the analysis and understanding of these data, and users now
	have many visualization methods and tools to choose from. The challenge
	is to create clear, meaningful and integrated visualizations that
	give biological insight, without being overwhelmed by the intrinsic
	complexity of the data. In this review, we discuss how visualization
	tools are being used to help interpret protein interaction, gene
	expression and metabolic profile data, and we highlight emerging
	new directions.},
  doi = {10.1038/nmeth.1436},
  institution = {European Bioinformatics Institute, Cambridge, UK.},
  keywords = {Genomics; Image Processing, Computer-Assisted; Mass Spectrometry;
	Metabolomics; Nuclear Magnetic Resonance, Biomolecular; Protein Binding;
	Proteomics; Systems Biology},
  language = {eng},
  medline-pst = {ppublish},
  owner = {philippe},
  pii = {nmeth.1436},
  pmid = {20195258},
  timestamp = {2010.07.27},
  url = {http://dx.doi.org/10.1038/nmeth.1436}
}
@article{Gether2000Uncovering,
  author = {U. Gether},
  title = {Uncovering molecular mechanisms involved in activation of {G} protein-coupled
	receptors.},
  journal = {Endocr Rev},
  year = {2000},
  volume = {21},
  pages = {90--113},
  number = {1},
  month = feb,
  abstract = {G protein-coupled, seven-transmembrane segment receptors (GPCRs or
	7TM receptors), with more than 1000 different members, comprise the
	largest superfamily of proteins in the body. Since the cloning of
	the first receptors more than a decade ago, extensive experimental
	work has uncovered multiple aspects of their function and challenged
	many traditional paradigms. However, it is only recently that we
	are beginning to gain insight into some of the most fundamental questions
	in the molecular function of this class of receptors. How can, for
	example, so many chemically diverse hormones, neurotransmitters,
	and other signaling molecules activate receptors believed to share
	a similar overall tertiary structure? What is the nature of the physical
	changes linking agonist binding to receptor activation and subsequent
	transduction of the signal to the associated G protein on the cytoplasmic
	side of the membrane and to other putative signaling pathways? The
	goal of the present review is to specifically address these questions
	as well as to depict the current awareness about GPCR structure-function
	relationships in general.},
  keywords = {Animals; GTP-Binding Proteins; Humans; Ligands; Models, Biological;
	Molecular Conformation; Receptors, Cell Surface},
  owner = {laurent},
  pmid = {10696571},
  timestamp = {2007.09.22}
}
@article{Girosi1998Equivalence,
  author = {Girosi, F.},
  title = {An {E}quivalence {B}etween {S}parse {A}pproximation and {S}upport
	{V}ector {M}achines.},
  journal = {Neural {C}omput},
  year = {1998},
  volume = {10},
  pages = {1455--1480},
  number = {6},
  month = jul,
  abstract = {This article shows a relationship between two different approximation
	techniques: the support vector machines ({SVM}), proposed by {V}.
	{V}apnik (1995) and a sparse approximation scheme that resembles
	the basis pursuit denoising algorithm ({C}hen, 1995; {C}hen, {D}onoho,
	and {S}aunders, 1995). {SVM} is a technique that can be derived from
	the structural risk minimization principle ({V}apnik, 1982) and can
	be used to estimate the parameters of several different approximation
	schemes, including radial basis functions, algebraic and trigonometric
	polynomials, {B}-splines, and some forms of multilayer perceptrons.
	{B}asis pursuit denoising is a sparse approximation technique in
	which a function is reconstructed by using a small number of basis
	functions chosen from a large set (the dictionary). {W}e show that
	if the data are noiseless, the modified version of basis pursuit
	denoising proposed in this article is equivalent to {SVM} in the
	following sense: if applied to the same data set, the two techniques
	give the same solution, which is obtained by solving the same quadratic
	programming problem. {I}n the appendix, we present a derivation of
	the {SVM} technique in one framework of regularization theory, rather
	than statistical learning theory, establishing a connection between
	{SVM}, sparse approximation, and regularization theory.},
  keywords = {Algorithms, Automated, Biometry, Computers, DNA, Databases, Factual,
	Fungal, Fungal Proteins, GTP-Binding Proteins, Gene Expression, Genes,
	Learning, Markov Chains, Models, Neural Networks (Computer), Neurological,
	Non-P.H.S., Non-U.S. Gov't, Nucleic Acid Hybridization, Open Reading
	Frames, P.H.S., Pattern Recognition, Protein, Protein Structure,
	Proteins, Reproducibility of Results, Research Support, Saccharomyces
	cerevisiae, Sequence Alignment, Sequence Analysis, Software, Statistical,
	Tertiary, U.S. Gov't, 9698353}
}
@article{Glotsos2004Automated,
  author = {Dimitris Glotsos and Panagiota Spyridonos and Dionisis Cavouras and
	Panagiota Ravazoula and Petroula-Arampantoni Dadioti and George Nikiforidis},
  title = {Automated segmentation of routinely hematoxylin-eosin-stained microscopic
	images by combining support vector machine clustering and active
	contour models.},
  journal = {Anal {Q}uant {C}ytol {H}istol},
  year = {2004},
  volume = {26},
  pages = {331--340},
  number = {6},
  month = dec,
  abstract = {O{BJECTIVE}: {T}o develop a method for the automated segmentation
	of images of routinely hematoxylin-eosin ({H}-{E})-stained microscopic
	sections to guarantee correct results in computer-assisted microscopy.
	{STUDY} {DESIGN}: {C}linical material was composed 50 {H}-{E}-stained
	biopsies of astrocytomas and 50 {H}-{E}-stained biopsies of urinary
	bladder cancer. {T}he basic idea was to use a support vector machine
	clustering ({SVMC}) algorithm to provide gross segmentation of regions
	holding nuclei and subsequently to refine nuclear boundary detection
	with active contours. {T}he initialization coordinates of the active
	contour model were defined using a {SVMC} pixel-based classification
	algorithm that discriminated nuclear regions from the surrounding
	tissue. {S}tarting from the boundaries of these regions, the snake
	fired and propagated until converging to nuclear boundaries. {RESULTS}:
	{T}he method was validated for 2 different types of {H}-{E}-stained
	images. {R}esults were evaluated by 2 histopathologists. {O}n average,
	94\% of nuclei were correctly delineated. {CONCLUSION}: {T}he proposed
	algorithm could be of value in computer-based systems for automated
	interpretation of microscopic images.},
  keywords = {Adenosinetriphosphatase, Adolescent, Adult, Algorithms, Amino Acid
	Sequence, Amino Acids, Animals, Astrocytoma, Automated, Automation,
	Base Sequence, Bayes Theorem, Biological, Biopsy, Bladder Neoplasms,
	Breast Neoplasms, Carbohydrate Conformation, Carbohydrate Sequence,
	Cattle, Cell Cycle Proteins, Cell Nucleus, Computational Biology,
	Computer Simulation, Computer-Assisted, Crystallography, DNA, Databases,
	Diagnosis, Differential, Eosine Yellowish-(YS), Exoribonucleases,
	Factual, False Negative Reactions, False Positive Reactions, Female,
	Gene Expression, Gene Expression Profiling, Genes, Genetic, Genetic
	Techniques, Genetic Vectors, Genome, Hematoxylin, Histocompatibility
	Antigens Class I, Human, Humans, Image Interpretation, Image Processing,
	Introns, Least-Squares Analysis, MHC Class I, Major Histocompatibility
	Complex, Markov Chains, Messenger, Mice, Middle Aged, Models, Molecular
	Structure, Monosaccharides, Multigene Family, Mutation, Neoplasms,
	Neural Networks (Computer), Non-P.H.S., Non-U.S. Gov't, Nonparametric,
	Nucleotidyltransferases, Observer Variation, Oligonucleotide Array
	Sequence Analysis, P.H.S., Pattern Recognition, Peptides, Phenotype,
	Phylogeny, Plants, Poly A, Polysaccharides, Predictive Value of Tests,
	Protein, Protein Biosynthesis, Protein Kinase Inhibitors, Protein
	Structure, Proteins, RNA, RNA Helicases, RNA Splicing, Rats, Reproducibility
	of Results, Research Support, Retrospective Studies, Saccharomyces
	cerevisiae, Saccharomyces cerevisiae Proteins, Secondary, Sensitivity
	and Specificity, Sequence Alignment, Software, Species Specificity,
	Staining and Labeling, Statistics, Theoretical, Transcription, U.S.
	Gov't, Ultrasonography, X-Ray, 15678615}
}
@article{Glotsos2004Computer-based,
  author = {Dimitris Glotsos and Panagiota Spyridonos and Panagiotis Petalas
	and Dionisis Cavouras and Panagiota Ravazoula and Petroula-Arampatoni
	Dadioti and Ioanna Lekka and George Nikiforidis},
  title = {Computer-based malignancy grading of astrocytomas employing a support
	vector machine classifier, the {WHO} grading system and the regular
	hematoxylin-eosin diagnostic staining procedure.},
  journal = {Anal {Q}uant {C}ytol {H}istol},
  year = {2004},
  volume = {26},
  pages = {77--83},
  number = {2},
  month = apr,
  abstract = {O{BJECTIVE}: {T}o investigate and develop an automated technique for
	astrocytoma malignancy grading compatible with the clinical routine.
	{STUDY} {DESIGN}: {O}ne hundred forty biopsies of astrocytomas were
	collected from 2 hospitals. {T}he degree of tumor malignancy was
	defined as low or high according to the {W}orld {H}ealth {O}rganization
	grading system. {F}rom each biopsy, images were digitized and segmented
	to isolate nuclei from background tissue. {M}orphologic and textural
	nuclear features were quantified to encode tumor malignancy. {E}ach
	case was represented by a 40-dimensional feature vector. {A}n exhaustive
	search procedure in feature space was utilized to determine the best
	feature combination that resulted in the smallest classification
	error. {L}ow and high grade tumors were discriminated using support
	vector machines ({SVM}s). {T}o evaluate the system performance, all
	available data were split randomly into training and test sets. {RESULTS}:
	{T}he best vector combination consisted of 3 textural and 2 morphologic
	features. {L}ow and high grade cases were discriminated with an accuracy
	of 90.7\% and 88.9\%, respectively, using an {SVM} classifier with
	polynomial kernel of degree 2. {CONCLUSION}: {T}he proposed methodology
	was based on standards that are common in daily clinical practice
	and might be used in parallel with conventional grading as a second-opinion
	tool to reduce subjectivity in the classification of astrocytomas.},
  keywords = {Amino Acids, Antibodies, Artificial Intelligence, Astrocytoma, Biological,
	Biopsy, Brain, Brain Mapping, Brain Neoplasms, Calibration, Comparative
	Study, Computational Biology, Computer-Assisted, Cysteine, Cystine,
	Electrodes, Electroencephalography, Eosine Yellowish-(YS), Evoked
	Potentials, Female, Hematoxylin, Horseradish Peroxidase, Humans,
	Image Processing, Imagery (Psychotherapy), Imagination, Laterality,
	Male, Monoclonal, Movement, Neoplasms, Non-P.H.S., Non-U.S. Gov't,
	P.H.S., Perception, Principal Component Analysis, Protein, Protein
	Array Analysis, Proteins, Research Support, Sensitivity and Specificity,
	Sequence Analysis, Software, Tumor Markers, U.S. Gov't, User-Computer
	Interface, World Health Organization, 15131894}
}
@article{Goffeau1996Life,
  author = {A. Goffeau and B. G. Barrell and H. Bussey and R. W. Davis and B. Dujon
	and H. Feldmann and F. Galibert and J. D. Hoheisel and C. Jacq and
	M. Johnston and E. J. Louis and H. W. Mewes and Y. Murakami and P.
	Philippsen and H. Tettelin and S. G. Oliver},
  title = {Life with 6000 genes},
  journal = {Science},
  year = {1996},
  volume = {274},
  pages = {546--567},
  number = {5287},
  month = oct,
  doi = {10.1126/science.274.5287.546},
  pdf = {../local/Goffeau1996Life.pdf},
  file = {Goffeau1996Life.pdf:local/Goffeau1996Life.pdf:PDF},
  subject = {bio},
  url = {http://www.sciencemag.org/cgi/content/abstract/274/5287/546}
}
@article{Gomez2003Learning,
  author = {Gomez, S. M. and Noble, W. S. and Rzhetsky, A.},
  title = {Learning to predict protein-protein interactions from protein sequences},
  journal = {Bioinformatics},
  year = {2003},
  volume = {19},
  pages = {1875--1881},
  number = {15},
  abstract = {In order to understand the molecular machinery of the cell, we need
	to know about the multitude of protein-protein interactions that
	allow the cell to function. {H}igh-throughput technologies provide
	some data about these interactions, but so far that data is fairly
	noisy. {T}herefore, computational techniques for predicting protein-protein
	interactions could be of significant value. {O}ne approach to predicting
	interactions in silico is to produce from first principles a detailed
	model of a candidate interaction. {W}e take an alternative approach,
	employing a relatively simple model that learns dynamically from
	a large collection of data. {I}n this work, we describe an attraction-repulsion
	model, in which the interaction between a pair of proteins is represented
	as the sum of attractive and repulsive forces associated with small,
	domain- or motif-sized features along the length of each protein.
	{T}he model is discriminative, learning simultaneously from known
	interactions and from pairs of proteins that are known (or suspected)
	not to interact. {T}he model is efficient to compute and scales well
	to very large collections of data. {I}n a cross-validated comparison
	using known yeast interactions, the attraction-repulsion method performs
	better than several competing techniques.},
  pdf = {../local/Gomez2003Learning.pdf},
  file = {Gomez2003Learning.pdf:local/Gomez2003Learning.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/15/1875}
}
@article{Gordon2003Sequence,
  author = {Gordon, L. and Chervonenkis, A. Y. and Gammerman, A. J. and Shahmuradov,
	I. A. and Solovyev, V. V.},
  title = {Sequence alignment kernel for recognition of promoter regions},
  journal = {Bioinformatics},
  year = {2003},
  volume = {19},
  pages = {1964--1971},
  number = {15},
  abstract = {In this paper we propose a new method for recognition of prokaryotic
	promoter regions with startpoints of transcription. {T}he method
	is based on {S}equence {A}lignment {K}ernel, a function reflecting
	the quantitative measure of match between two sequences. {T}his kernel
	function is further used in {D}ual {SVM}, which performs the recognition.
	{S}everal recognition methods have been trained and tested on positive
	data set, consisting of 669 $\sigma^{70}$-promoter regions with known
	transcription startpoints of {E}scherichia coli and two negative
	data sets of 709 examples each, taken from coding and non-coding
	regions of the same genome. {T}he results show that our method performs
	well and achieves 16.5\% average error rate on positive \& coding negative
	data and 18.6\% average error rate on positive \& non-coding negative
	data. {A}vailability:{T}he demo version of our method is accessible
	from our website http://mendel.cs.rhul.ac.uk/},
  pdf = {../local/Gordon2003Sequence.pdf},
  file = {Gordon2003Sequence.pdf:local/Gordon2003Sequence.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/15/1964}
}
@article{Goto1998LIGAND:,
  author = {S. Goto and T. Nishioka and M. Kanehisa},
  title = {{LIGAND}: chemical database for enzyme reactions},
  journal = {Bioinformatics},
  year = {1998},
  volume = {14},
  pages = {591--599},
  number = {7},
  pdf = {../local/goto98.pdf},
  file = {goto98.pdf:local/goto98.pdf:PDF},
  subject = {bionet},
  url = {http://bioinformatics.oupjournals.org/cgi/reprint/14/7/591}
}
@article{Goto2002LIGAND:,
  author = {S. Goto and Y. Okuno and M. Hattori and T. Nishioka and M. Kanehisa},
  title = {{LIGAND}: database of chemical compounds and reactions in biological
	pathways},
  journal = {Nucleic {A}cids {R}es.},
  year = {2002},
  volume = {30},
  pages = {402--404},
  number = {1},
  pdf = {../local/goto02.pdf},
  file = {goto02.pdf:local/goto02.pdf:PDF},
  subject = {bionet},
  url = {http://nar.oupjournals.org/cgi/content/full/30/1/402}
}
@inproceedings{Grundy1998Family-based,
  author    = {Grundy, W. N.},
  title     = {Family-based {H}omology {D}etection via {P}airwise {S}equence {C}omparison},
  booktitle = {Proceedings of the {S}econd {A}nnual {I}nternational {C}onference on {C}omputational {M}olecular {B}iology, {M}arch 22-25},
  pages     = {94--100},
  year      = {1998},
  url       = {http://www.cs.columbia.edu/~bgrundy/papers/compare.html},
  file      = {grun98.pdf:local/grun98.pdf:PDF},
  pdf       = {../local/grun98.pdf},
  subject   = {biocasp},
}
@article{Guelzim2002Topological,
  author = {Guelzim, N. and Bottani, S. and Bourgine, P. and K{\'e}p{\`e}s, F.},
  title = {Topological and causal structure of the yeast transcriptional regulatory
	network},
  journal = {Nat. {G}enet.},
  year = {2002},
  volume = {31},
  pages = {60--63},
  number = {1},
  pdf = {../local/guel02.pdf},
  file = {guel02.pdf:local/guel02.pdf:PDF},
  subject = {bionet},
  url = {http://www.nature.com/cgi-taf/DynaPage.taf?file=/ng/journal/v31/n1/full/ng873.html}
}
@article{Guermeur2002Combining,
  author = {Guermeur, Y.},
  title = {Combining {D}iscriminant {M}odels with {N}ew {M}ulti-{C}lass {SVM}s},
  journal = {Pattern {A}nal. {A}ppl.},
  year = {2002},
  volume = {5},
  pages = {168--179},
  number = {2},
  abstract = {The idea of performing model combination, instead of model selection,
	has a long theoretical background in statistics. {H}owever, making
	use of theoretical results is ordinarily subject to the satisfaction
	of strong hypotheses (weak error correlation, availability of large
	training sets, possibility to rerun the training procedure an arbitrary
	number of times, etc.). {I}n contrast, the practitioner is frequently
	faced with the problem of combining a given set of pre-trained classifiers,
	with highly correlated errors, using only a small training sample.
	{O}verfitting is then the main risk, which cannot be overcome but
	with a strict complexity control of the combiner selected. {T}his
	suggests that {SVM}s should be well suited for these difficult situations.
	{I}nvestigating this idea, we introduce a family of multi-class {SVM}s
	and assess them as ensemble methods on a real-world problem. {T}his
	task, protein secondary structure prediction, is an open problem
	in biocomputing for which model combination appears to be an issue
	of central importance. {E}xperimental evidence highlights the gain
	in quality resulting from combining some of the most widely used
	prediction methods with our {SVM}s rather than with the ensemble
	methods traditionally used in the field. {T}he gain increases when
	the outputs of the combiners are post-processed with a {DP} algorithm.},
  doi = {10.1007/s100440200015},
  pdf = {../local/Guermeur2002Combining.pdf},
  file = {Guermeur2002Combining.pdf:local/Guermeur2002Combining.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1007/s100440200015}
}
@incollection{Guermeur2004kernel,
  author = {Guermeur, Y. and Lifschitz, A. and Vert, R.},
  title = {A kernel for protein secondary structure prediction},
  booktitle = {Kernel {M}ethods in {C}omputational {B}iology},
  publisher = {MIT Press},
  year = {2004},
  editor = {Sch{\"o}lkopf, B. and Tsuda, K. and Vert, J.P.},
  pages = {193--206},
  keywords = {biosvm},
  owner = {vert}
}
@article{Guermeur2004Combining,
  author = {Guermeur, Y. and Pollastri, G. and Elisseeff, A. and Zelus, D. and
	Paugam-Moisy, H. and Baldi, P.},
  title = {Combining protein secondary structure prediction models with ensemble
	methods of optimal complexity},
  journal = {Neurocomputing},
  year = {2004},
  volume = {56},
  pages = {305--327},
  abstract = {Many sophisticated methods are currently available to perform protein
	secondary structure prediction. {S}ince they are frequently based
	on different principles, and different knowledge sources, significant
	benefits can be expected from combining them. {H}owever, the choice
	of an appropriate combiner appears to be an issue in its own right.
	{T}he first difficulty to overcome when combining prediction methods
	is overfitting. {T}his is the reason why we investigate the implementation
	of {S}upport {V}ector {M}achines to perform the task. {A} family
	of multi-class {SVM}s is introduced. {T}wo of these machines are
	used to combine some of the current best protein secondary structure
	prediction methods. {T}heir performance is consistently superior
	to the performance of the ensemble methods traditionally used in
	the field. {T}hey also outperform the decomposition approaches based
	on bi-class {SVM}s. {F}urthermore, initial experimental evidence
	suggests that their outputs could be processed by the biologist to
	perform higher-level treatments.},
  doi = {10.1016/j.neucom.2003.10.004},
  pdf = {../local/Guermeur2004Combining.pdf},
  file = {Guermeur2004Combining.pdf:local/Guermeur2004Combining.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1016/j.neucom.2003.10.004}
}
@article{Guiot2007Morphological,
  author = {Caterina Guiot and Pier P Delsanto and Thomas S Deisboeck},
  title = {Morphological instability and cancer invasion: a `splashing water
	drop' analogy.},
  journal = {Theor. Biol. Med. Model.},
  year = {2007},
  volume = {4},
  pages = {4},
  abstract = {BACKGROUND: Tissue invasion, one of the hallmarks of cancer, is a
	major clinical problem. Recent studies suggest that the process of
	invasion is driven at least in part by a set of physical forces that
	may be susceptible to mathematical modelling which could have practical
	clinical value. MODEL AND CONCLUSION: We present an analogy between
	two unrelated instabilities. One is caused by the impact of a drop
	of water on a solid surface while the other concerns a tumor that
	develops invasive cellular branches into the surrounding host tissue.
	In spite of the apparent abstractness of the idea, it yields a very
	practical result, i.e. an index that predicts tumor invasion based
	on a few measurable parameters. We discuss its application in the
	context of experimental data and suggest potential clinical implications.},
  doi = {10.1186/1742-4682-4-4},
  institution = {Dip. Neuroscience and CNISM, Universit{\`a} di Torino, Italy. caterina.guiot@unito.it},
  keywords = {Animals; Biomechanics; Cell Adhesion; Humans; Mathematics; Models,
	Biological; Neoplasm Invasiveness; Neoplasms, pathology; Surface
	Tension},
  language = {eng},
  medline-pst = {epublish},
  owner = {philippe},
  pii = {1742-4682-4-4},
  pmid = {17254360},
  timestamp = {2011.07.15},
  url = {http://dx.doi.org/10.1186/1742-4682-4-4}
}
@article{Gunderson2004Decoding,
  author = {Kevin L Gunderson and Semyon Kruglyak and Michael S Graige and Francisco
	Garcia and Bahram G Kermani and Chanfeng Zhao and Diping Che and
	Todd Dickinson and Eliza Wickham and Jim Bierle and Dennis Doucet
	and Monika Milewski and Robert Yang and Chris Siegmund and Juergen
	Haas and Lixin Zhou and Arnold Oliphant and Jian-Bing Fan and Steven
	Barnard and Mark S Chee},
  title = {Decoding randomly ordered {DNA} arrays.},
  journal = {Genome Res},
  year = {2004},
  volume = {14},
  pages = {870--877},
  number = {5},
  month = may,
  abstract = {We have developed a simple and efficient algorithm to identify each
	member of a large collection of DNA-linked objects through the use
	of hybridization, and have applied it to the manufacture of randomly
	assembled arrays of beads in wells. Once the algorithm has been used
	to determine the identity of each bead, the microarray can be used
	in a wide variety of applications, including single nucleotide polymorphism
	genotyping and gene expression profiling. The algorithm requires
	only a few labels and several sequential hybridizations to identify
	thousands of different DNA sequences with great accuracy. We have
	decoded tens of thousands of arrays, each with 1520 sequences represented
	at approximately 30-fold redundancy by up to approximately 50,000
	beads, with a median error rate of <1 x 10(-4) per bead. The approach
	makes use of error checking codes and provides, for the first time,
	a direct functional quality control of every element of each array
	that is manufactured. The algorithm can be applied to any spatially
	fixed collection of objects or molecules that are associated with
	specific DNA sequences.},
  doi = {10.1101/gr.2255804},
  institution = {Illumina, Inc., San Diego, California 92121, USA.},
  keywords = {Algorithms; Computational Biology, methods; Oligonucleotide Array
	Sequence Analysis, methods/trends; Random Allocation; Research Design;
	Sequence Analysis, DNA, methods; Silicon Dioxide, chemistry},
  language = {eng},
  medline-pst = {ppublish},
  owner = {philippe},
  pii = {2255804},
  pmid = {15078854},
  timestamp = {2010.08.04},
  url = {http://dx.doi.org/10.1101/gr.2255804}
}
@article{Guo2004novel,
  author = {Guo, J. and Chen, H. and Sun, Z. and Lin, Y.},
  title = {A novel method for protein secondary structure prediction using dual-layer
	{SVM} and profiles},
  journal = {Proteins},
  year = {2004},
  volume = {54},
  pages = {738--743},
  number = {4},
  abstract = {A high-performance method was developed for protein secondary structure
	prediction based on the dual-layer support vector machine ({SVM})
	and position-specific scoring matrices ({PSSM}s). {SVM} is a new
	machine learning technology that has been successfully applied in
	solving problems in the field of bioinformatics. {T}he {SVM}'s performance
	is usually better than that of traditional machine learning approaches.
	{T}he performance was further improved by combining {PSSM} profiles
	with the {SVM} analysis. {T}he {PSSM}s were generated from {PSI}-{BLAST}
	profiles, which contain important evolution information. {T}he final
	prediction results were generated from the second {SVM} layer output.
	{O}n the {CB}513 data set, the three-state overall per-residue accuracy,
	{Q}3, reached 75.2\%, while segment overlap ({SOV}) accuracy increased
	to 80.0\%. {O}n the {CB}396 data set, the {Q}3 of our method reached
	74.0\% and the {SOV} reached 78.1\%. {A} web server utilizing the method
	has been constructed and is available at http://www.bioinfo.tsinghua.edu.cn/pmsvm.},
  doi = {10.1002/prot.10634},
  pdf = {../local/Guo2004novel.pdf},
  file = {Guo2004novel.pdf:local/Guo2004novel.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1002/prot.10634}
}
@article{Guo2005novel,
  author = {Ting Guo and Yanxin Shi and Zhirong Sun},
  title = {A novel statistical ligand-binding site predictor: application to
	{ATP}-binding sites.},
  journal = {Protein {E}ng {D}es {S}el},
  year = {2005},
  volume = {18},
  pages = {65--70},
  number = {2},
  month = feb,
  abstract = {Structural genomics initiatives are leading to rapid growth in newly
	determined protein 3{D} structures, the functional characterization
	of which may still be inadequate. {A}s an attempt to provide insights
	into the possible roles of the emerging proteins whose structures
	are available and/or to complement biochemical research, a variety
	of computational methods have been developed for the screening and
	prediction of ligand-binding sites in raw structural data, including
	statistical pattern classification techniques. {I}n this paper, we
	report a novel statistical descriptor (the {O}riented {S}hell {M}odel)
	for protein ligand-binding sites, which utilizes the distance and
	angular position distribution of various structural and physicochemical
	features present in immediate proximity to the center of a binding
	site. {U}sing the support vector machine ({SVM}) as the classifier,
	our model identified 69\% of the {ATP}-binding sites in whole-protein
	scanning tests and in eukaryotic proteins the accuracy is particularly
	high. {W}e propose that this feature extraction and machine learning
	procedure can screen out ligand-binding-capable protein candidates
	and can yield valuable biochemical information for individual proteins.},
  doi = {10.1093/protein/gzi006},
  pdf = {../local/Guo2005novel.pdf},
  file = {Guo2005novel.pdf:local/Guo2005novel.pdf:PDF},
  keywords = {biosvm},
  pii = {gzi006},
  url = {http://dx.doi.org/10.1093/protein/gzi006}
}
@article{Gururaja2003Multiple,
  author = {Gururaja, T. and Li, W. and Noble, W.S. and Payan, D.G. and Anderson,
	D.C.},
  title = {Multiple functional categories of proteins identified in an in vitro
	cellular ubiquitin affinity extract using shotgun peptide sequencing},
  journal = {J {P}roteome {R}es},
  year = {2003},
  volume = {2},
  pages = {394--404},
  number = {4},
  abstract = {Using endogenous human cellular ubiquitin system enzymes and added
	his-tagged ubiquitin, {ATP}, and an {ATP}-regenerating system, we
	labelled cellular proteins with hexahistidine tagged ubiquitin in
	vitro. {L}abeling was dependent on {ATP} and the {ATP} recycling
	system, on the proteasome inhibitor {MG}132 and the ubiquitin protease
	inhibitor ubiquitin aldehyde, and was inhibited by iodoacetamide.
	{L}abeled proteins were affinity extracted in quadruplicate and tryptic
	peptides identified by 2{D} capillary {LC}/{MS}/{MS} combined with
	{SEQUEST} and {MEDUSA} analyses. {S}upport vector machine analysis
	of the mass spectrometry data allowed prediction of correct matches
	between mass spectrometry data and peptide sequences. {O}verall,
	144 proteins were identified by peptides predicted to be correctly
	sequenced, and 113 were identified by at least three peptides or
	one or two peptides with at least an 80\% chance of being correct.
	{I}dentified proteins included 22 proteasome subunits or associated
	proteins, 18 {E}1, {E}2 or {E}3 ubiquitin system enzymes or related
	proteins, and four ubiquitin domain proteins. {S}eventeen directly
	ubiquitinated proteins or proteins associated with the ubiquitin
	system were identified. {F}unctional clusters of other proteins included
	redox enzymes, proteins associated with endocytosis, cytoskeletal
	proteins, {DNA} damage or repair related proteins, calcium binding
	proteins, and splicing factor and related proteins, suggesting that
	in vitro ubiquitination is not random, and that these functions may
	be regulated by the ubiquitin system. {T}his map of cellular ubiquitinated
	proteins and their interacting proteins will be useful for further
	studies of ubiquitin system function.},
  pdf = {../local/Gururaja2003Multiple.pdf},
  file = {Gururaja2003Multiple.pdf:local/Gururaja2003Multiple.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert}
}
@article{Guyon2002Gene,
  author = {Guyon, I. and Weston, J. and Barnhill, S. and Vapnik, V.},
  title = {Gene selection for cancer classification using support vector machines},
  journal = {Mach. Learn.},
  year = {2002},
  volume = {46},
  pages = {389--422},
  number = {1--3},
  month = jan,
  abstract = {D{NA} micro-arrays now permit scientists to screen thousands of genes
	simultaneously and determine whether those genes are active, hyperactive
	or silent in normal or cancerous tissue. {B}ecause these new micro-array
	devices generate bewildering amounts of raw data, new analytical
	methods must be developed to sort out whether cancer tissues have
	distinctive signatures of gene expression over normal tissues or
	other types of cancer tissues. {I}n this paper, we address the problem
	of selection of a small subset of genes from broad patterns of gene
	expression data, recorded on {DNA} micro-arrays. {U}sing available
	training examples from cancer and normal patients, we build a classifier
	suitable for genetic diagnosis, as well as drug discovery. {P}revious
	attempts to address this problem select genes with correlation techniques.
	{W}e propose a new method of gene selection utilizing {S}upport {V}ector
	{M}achine methods based on {R}ecursive {F}eature {E}limination ({RFE}).
	{W}e demonstrate experimentally that the genes selected by our techniques
	yield better classification performance and are biologically relevant
	to cancer. {I}n contrast with the baseline method, our method eliminates
	gene redundancy automatically and yields better and more compact
	gene subsets. {I}n patients with leukemia our method discovered 2
	genes that yield zero leave-one-out error, while 64 genes are necessary
	for the baseline method to get the best result (one leave-one-out
	error). {I}n the colon cancer database, using only 4 genes our method
	is 98\% accurate, while the baseline method is only 86\% accurate.},
  pdf = {../local/Guyon2002Gene.pdf},
  file = {Guyon2002Gene.pdf:local/Guyon2002Gene.pdf:PDF},
  keywords = {biosvm},
  subject = {biokernel},
  url = {http://homepages.nyu.edu/~jaw281/genesel.pdf}
}
@article{Gartner2004Kernels,
  author = {G{\"a}rtner, T. and Lloyd, J.W. and Flach, P.A.},
  title = {Kernels and Distances for Structured Data},
  journal = {Mach. Learn.},
  year = {2004},
  volume = {57},
  pages = {205--232},
  number = {3},
  abstract = {This paper brings together two strands of machine learning of increasing
	importance: kernel methods and highly structured data. We propose
	a general method for constructing a kernel following the syntactic
	structure of the data, as defined by its type signature in a higher-order
	logic. Our main theoretical result is the positive definiteness of
	any kernel thus defined. We report encouraging experimental results
	on a range of real-world data sets. By converting our kernel to a
	distance pseudo-metric for 1-nearest neighbour, we were able to improve
	the best accuracy from the literature on the Diterpene data set by
	more than 10\%.},
  doi = {10.1023/B:MACH.0000039777.23772.30},
  keywords = {biosvm},
  timestamp = {2006.07.11},
  url = {http://dx.doi.org/10.1023/B:MACH.0000039777.23772.30}
}
@article{Haasdonk2005Feature,
  author = {Bernard Haasdonk},
  title = {Feature space interpretation of {SVM}s with indefinite kernels.},
  journal = {I{EEE} {T}rans {P}attern {A}nal {M}ach {I}ntell},
  year = {2005},
  volume = {27},
  pages = {482--492},
  number = {4},
  month = apr,
  abstract = {Kernel methods are becoming increasingly popular for various kinds
	of machine learning tasks, the most famous being the support vector
	machine ({SVM}) for classification. {T}he {SVM} is well understood
	when using conditionally positive definite (cpd) kernel functions.
	{H}owever, in practice, non-cpd kernels arise and demand application
	in {SVM}s. {T}he procedure of "plugging" these indefinite kernels
	in {SVM}s often yields good empirical classification results. {H}owever,
	they are hard to interpret due to missing geometrical and theoretical
	understanding. {I}n this paper, we provide a step toward the comprehension
	of {SVM} classifiers in these situations. {W}e give a geometric interpretation
	of {SVM}s with indefinite kernel functions. {W}e show that such {SVM}s
	are optimal hyperplane classifiers not by margin maximization, but
	by minimization of distances between convex hulls in pseudo-{E}uclidean
	spaces. {B}y this, we obtain a sound framework and motivation for
	indefinite {SVM}s. {T}his interpretation is the basis for further
	theoretical analysis, e.g., investigating uniqueness, and for the
	derivation of practical guidelines like characterizing the suitability
	of indefinite {SVM}s.},
  doi = {10.1109/TPAMI.2005.78},
  pdf = {../local/Haasdonk2005Feature.pdf},
  file = {Haasdonk2005Feature.pdf:local/Haasdonk2005Feature.pdf:PDF},
  keywords = {Algorithms, Animals, Antibiotics, Antineoplastic, Artificial Intelligence,
	Automated, Automatic Data Processing, Butadienes, Chloroplasts, Cluster
	Analysis, Comparative Study, Computer Simulation, Computer-Assisted,
	Computing Methodologies, Database Management Systems, Databases,
	Diagnosis, Disinfectants, Dose-Response Relationship, Drug, Drug
	Toxicity, Electrodes, Electroencephalography, Ethylamines, Expert
	Systems, Factual, Feedback, Fungicides, Gene Expression Profiling,
	Genes, Genetic Markers, Humans, Image Enhancement, Image Interpretation,
	Implanted, Industrial, Information Storage and Retrieval, Kidney,
	Kidney Tubules, MEDLINE, Male, Mercuric Chloride, Microarray Analysis,
	Molecular Biology, Motor Cortex, Movement, Natural Language Processing,
	Neural Networks (Computer), Non-P.H.S., Non-U.S. Gov't, Numerical
	Analysis, Pattern Recognition, Plant Proteins, Predictive Value of
	Tests, Proteins, Proteome, Proximal, Puromycin Aminonucleoside, Rats,
	Reproducibility of Results, Research Support, Sensitivity and Specificity,
	Signal Processing, Sprague-Dawley, Subcellular Fractions, Terminology,
	Therapy, Time Factors, Toxicogenetics, U.S. Gov't, User-Computer
	Interface, 15794155},
  url = {http://dx.doi.org/10.1109/TPAMI.2005.78}
}
@article{Haferlach2005AML,
  author = {Torsten Haferlach and Alexander Kohlmann and Susanne Schnittger and
	Martin Dugas and Wolfgang Hiddemann and Wolfgang Kern and Claudia
	Schoch},
  title = {A{ML} {M}3 and {AML} {M}3 variant each have a distinct gene expression
	signature but also share patterns different from other genetically
	defined {AML} subtypes.},
  journal = {Genes {C}hromosomes {C}ancer},
  year = {2005},
  volume = {43},
  pages = {113--127},
  number = {2},
  month = jun,
  abstract = {Acute promyelocytic leukemia ({APL}) with t(15;17) appears in two
	phenotypes: {AML} {M}3, with abnormal promyelocytes showing heavy
	granulation and bundles of {A}uer rods, and {AML} {M}3 variant ({M}3v),
	with non- or hypogranular cytoplasm and a bilobed nucleus. {W}e investigated
	the global gene expression profiles of 35 {APL} patients (19 {AML}
	{M}3, 16 {AML} {M}3v) by using high-density {DNA}-oligonucleotide
	microarrays. {F}irst, an unsupervised approach clearly separated
	{APL} samples from other {AML}s characterized genetically as t(8;21)
	(n = 35), inv(16) (n = 35), or t(11q23)/{MLL} (n = 35) or as having
	a normal karyotype (n = 50). {S}econd, we found genes with functional
	relevance for blood coagulation that were differentially expressed
	between {APL} and other {AML}s. {F}urthermore, a supervised pairwise
	comparison between {M}3 and {M}3v revealed differential expression
	of genes that encode for biological functions and pathways such as
	granulation and maturation of hematologic cells, explaining morphologic
	and clinical differences. {D}iscrimination between {M}3 and {M}3v
	based on gene signatures showed a median classification accuracy
	of 90\% by use of 10-fold {CV} and support vector machines. {A}dditional
	molecular mutations such as {FLT}3-{LM}, which were significantly
	more frequent in {M}3v than in {M}3 ({P} < 0.0001), may partly contribute
	to the different phenotypes. {H}owever, linear regression analysis
	demonstrated that genes differentially expressed between {M}3 and
	{M}3v did not correlate with {FLT}3-{LM}.},
  doi = {10.1002/gcc.20175},
  pdf = {../local/Haferlach2005AML.pdf},
  file = {Haferlach2005AML.pdf:local/Haferlach2005AML.pdf:PDF},
  keywords = {biosvm microarray},
  url = {http://dx.doi.org/10.1002/gcc.20175}
}
@article{Haferlach2005global,
  author = {Torsten Haferlach and Alexander Kohlmann and Susanne Schnittger and
	Martin Dugas and Wolfgang Hiddemann and Wolfgang Kern and Claudia
	Schoch},
  title = {A global approach to the diagnosis of leukemia using gene expression
	profiling.},
  journal = {Blood},
  year = {2005},
  volume = {106},
  pages = {1189--1198},
  number = {4},
  month = aug,
  abstract = {Accurate diagnosis and classification of leukemias are the bases for
	the appropriate management of patients. {T}he diagnostic accuracy
	and efficiency of present methods may be improved by the use of microarrays
	for gene expression profiling. {W}e analyzed gene expression profiles
	in bone marrow and peripheral blood samples from 937 patients with
	all clinically relevant leukemia subtypes (n=892) and non-leukemic
	controls (n=45) by {U}133{A} and {B} {G}ene{C}hips ({A}ffymetrix).
	{F}or each subgroup differentially expressed genes were calculated.
	{C}lass prediction was performed using support vector machines. {P}rediction
	accuracies were estimated by 10-fold cross validation and assessed
	for robustness in a 100-fold resampling approach using randomly chosen
	test-sets consisting of 1/3 of the samples. {A}pplying the top 100
	genes of each subgroup an overall prediction accuracy of 95.1\% was
	achieved which was confirmed by resampling (median, 93.8\%; 95\%
	confidence interval, 91.4\%-95.8\%). {I}n particular, {AML} with
	t(15;17), t(8;21), or inv(16), {CLL}, and {P}ro-{B}-{ALL} with t(11q23)
	were classified with 100\% sensitivity and 100\% specificity. {A}ccordingly,
	cluster analysis completely separated all of the 13 subgroups analyzed.
	{G}ene expression profiling can predict all clinically relevant subentities
	of leukemia with high accuracy.},
  doi = {10.1182/blood-2004-12-4938},
  pdf = {../local/Haferlach2005global.pdf},
  file = {Haferlach2005global.pdf:local/Haferlach2005global.pdf:PDF},
  keywords = {biosvm microarray},
  pii = {2004-12-4938},
  url = {http://dx.doi.org/10.1182/blood-2004-12-4938}
}
@article{Hakenberg2005Systematic,
  author = {J{\"o}rg Hakenberg and Steffen Bickel and Conrad Plake and Ulf Brefeld
	and Hagen Zahn and Lukas Faulstich and Ulf Leser and Tobias Scheffer},
  title = {Systematic feature evaluation for gene name recognition.},
  journal = {B{MC} {B}ioinformatics},
  year = {2005},
  volume = {6},
  pages = {S9},
  number = {Suppl 1},
  abstract = {In task 1{A} of the {B}io{C}re{A}t{I}v{E} evaluation, systems had
	to be devised that recognize words and phrases forming gene or protein
	names in natural language sentences. {W}e approach this problem by
	building a word classification system based on a sliding window approach
	with a {S}upport {V}ector {M}achine, combined with a pattern-based
	post-processing for the recognition of phrases. {T}he performance
	of such a system crucially depends on the type of features chosen
	for consideration by the classification method, such as pre- or postfixes,
	character n-grams, patterns of capitalization, or classification
	of preceding or following words. {W}e present a systematic approach
	to evaluate the performance of different feature sets based on recursive
	feature elimination, {RFE}. {B}ased on a systematic reduction of
	the number of features used by the system, we can quantify the impact
	of different feature sets on the results of the word classification
	problem. {T}his helps us to identify descriptive features, to learn
	about the structure of the problem, and to design systems that are
	faster and easier to understand. {W}e observe that the {SVM} is robust
	to redundant features. {RFE} improves the performance by 0.7\%, compared
	to using the complete set of attributes. {M}oreover, a performance
	that is only 2.3\% below this maximum can be obtained using fewer
	than 5\% of the features.},
  doi = {10.1186/1471-2105-6-S1-S9},
  pdf = {../local/Hakenberg2005Systematic.pdf},
  file = {Hakenberg2005Systematic.pdf:local/Hakenberg2005Systematic.pdf:PDF},
  keywords = {biosvm},
  pii = {1471-2105-6-S1-S9},
  url = {http://dx.doi.org/10.1186/1471-2105-6-S1-S9}
}
@article{Hakenberg2004Finding,
  author = {Hakenberg, J. and Schmeier, S. and Kowald, A. and Klipp, E. and Leser,
	U.},
  title = {Finding kinetic parameters using text mining.},
  journal = {O{MICS}},
  year = {2004},
  volume = {8},
  pages = {131--152},
  number = {2},
  abstract = {The mathematical modeling and description of complex biological processes
	has become more and more important over the last years. {S}ystems
	biology aims at the computational simulation of complex systems,
	up to whole cell simulations. {A}n essential part focuses on solving
	a large number of parameterized differential equations. {H}owever,
	measuring those parameters is an expensive task, and finding them
	in the literature is very laborious. {W}e developed a text mining
	system that supports researchers in their search for experimentally
	obtained parameters for kinetic models. {O}ur system classifies full
	text documents regarding the question whether or not they contain
	appropriate data using a support vector machine. {W}e evaluated our
	approach on a manually tagged corpus of 800 documents and found that
	it outperforms keyword searches in abstracts by a factor of five
	in terms of precision.},
  doi = {10.1089/1536231041388366},
  pdf = {../local/Hakenberg2004Finding.pdf},
  file = {Hakenberg2004Finding.pdf:local/Hakenberg2004Finding.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://www.liebertonline.com/doi/abs/10.1089%2F1536231041388366}
}
@article{Han2004Predicting,
  author = {Han, L.Y. and Cai, C.Z. and Ji, Z.L. and Cao, Z.W. and Cui, J. and
	Chen, Y.Z.},
  title = {Predicting functional family of novel enzymes irrespective of sequence
	similarity: a statistical learning approach.},
  journal = {Nucl. {A}cids {R}es.},
  year = {2004},
  volume = {32},
  pages = {6437--6444},
  number = {21},
  abstract = {The function of a protein that has no sequence homolog of known function
	is difficult to assign on the basis of sequence similarity. {T}he
	same problem may arise for homologous proteins of different functions
	if one is newly discovered and the other is the only known protein
	of similar sequence. {I}t is desirable to explore methods that are
	not based on sequence similarity. {O}ne approach is to assign functional
	family of a protein to provide useful hint about its function. {S}everal
	groups have employed a statistical learning method, support vector
	machines ({SVM}s), for predicting protein functional family directly
	from sequence irrespective of sequence similarity. {T}hese studies
	showed that {SVM} prediction accuracy is at a level useful for functional
	family assignment. {B}ut its capability for assignment of distantly
	related proteins and homologous proteins of different functions has
	not been critically and adequately assessed. {H}ere {SVM} is tested
	for functional family assignment of two groups of enzymes. {O}ne
	consists of 50 enzymes that have no homolog of known function from
	{PSI}-{BLAST} search of protein databases. {T}he other contains eight
	pairs of homologous enzymes of different families. {SVM} correctly
	assigns 72\% of the enzymes in the first group and 62\% of the enzyme
	pairs in the second group, suggesting that it is potentially useful
	for facilitating functional study of novel proteins. {A} web version
	of our software, {SVMP}rot, is accessible at http://jing.cz3.nus.edu.sg/cgi-bin/svmprot.cgi.},
  doi = {10.1093/nar/gkh984},
  pdf = {../local/Han2004Predicting.pdf},
  file = {Han2004Predicting.pdf:local/Han2004Predicting.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1093/nar/gkh984}
}
@article{Han2005Prediction,
  author = {Han, L.Y. and Cai, C.Z. and Ji, Z.L. and Chen, Y.Z.},
  title = {Prediction of functional class of novel viral proteins by a statistical
	learning method irrespective of sequence similarity},
  journal = {Virology},
  year = {2005},
  volume = {331},
  pages = {136--143},
  number = {1},
  abstract = {The function of a substantial percentage of the putative protein-coding
	open reading frames ({ORF}s) in viral genomes is unknown. {A}s their
	sequence is not similar to that of proteins of known function, the
	function of these {ORF}s cannot be assigned on the basis of sequence
	similarity. {M}ethods complement or in combination with sequence
	similarity-based approaches are being explored. {T}he web-based software
	{SVMP}rot (http://jing.cz3.nus.edu.sg/cgi-bin/svmprot.cgi) to some
	extent assigns protein functional family irrespective
	of sequence similarity and has been found to be useful for studying
	distantly related proteins [{C}ai, {C}.{Z}., {H}an, {L}.{Y}., {J}i,
	{Z}.{L}., {C}hen, {X}., {C}hen, {Y}.{Z}., 2003. {SVM}-{P}rot: web-based
	support vector machine software for functional classification of
	a protein from its primary sequence. {N}ucleic {A}cids {R}es. 31(13):
	3692-3697]. {H}ere 25 novel viral proteins are selected to test the
	capability of {SVMP}rot for functional family assignment of viral
	proteins whose function cannot be confidently predicted on by sequence
	similarity methods at present. {T}hese proteins are without a sequence
	homolog in the {S}wissprot database, with its precise function provided
	in the literature, and not included in the training sets of {SVMP}rot.
	{T}he predicted functional classes of 72\% of these proteins match
	the literature-described function, which is compared to the overall
	accuracy of 87\% for {SVMP}rot functional class assignment of 34582
	proteins. {T}his suggests that {SVMP}rot to some extent is capable
	of functional class assignment irrespective of sequence similarity
	and it is potentially useful for facilitating functional study of
	novel viral proteins.},
  doi = {10.1016/j.virol.2004.10.020},
  pdf = {../local/Han2005Prediction.pdf},
  file = {Han2005Prediction.pdf:local/Han2005Prediction.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1016/j.virol.2004.10.020}
}
@article{Han2004Prediction,
  author = {Han, L.Y. and Cai, C.Z. and Lo, S.L. and Chung, M.C. and Chen, Y.Z.},
  title = {Prediction of {RNA}-binding proteins from primary sequence by a support
	vector machine approach.},
  journal = {R{NA}},
  year = {2004},
  volume = {10},
  pages = {355--368},
  number = {3},
  abstract = {Elucidation of the interaction of proteins with different molecules
	is of significance in the understanding of cellular processes. {C}omputational
	methods have been developed for the prediction of protein-protein
	interactions. {B}ut insufficient attention has been paid to the prediction
	of protein-{RNA} interactions, which play central roles in regulating
	gene expression and certain {RNA}-mediated enzymatic processes. {T}his
	work explored the use of a machine learning method, support vector
	machines ({SVM}), for the prediction of {RNA}-binding proteins directly
	from their primary sequence. {B}ased on the knowledge of known {RNA}-binding
	and non-{RNA}-binding proteins, an {SVM} system was trained to recognize
	{RNA}-binding proteins. {A} total of 4011 {RNA}-binding and 9781
	non-{RNA}-binding proteins was used to train and test the {SVM} classification
	system, and an independent set of 447 {RNA}-binding and 4881 non-{RNA}-binding
	proteins was used to evaluate the classification accuracy. {T}esting
	results using this independent evaluation set show a prediction accuracy
	of 94.1\%, 79.3\%, and 94.1\% for r{RNA}-, m{RNA}-, and t{RNA}-binding
	proteins, and 98.7\%, 96.5\%, and 99.9\% for non-r{RNA}-, non-m{RNA}-,
	and non-t{RNA}-binding proteins, respectively. {T}he {SVM} classification
	system was further tested on a small class of sn{RNA}-binding proteins
	with only 60 available sequences. {T}he prediction accuracy is 40.0\%
	and 99.9\% for sn{RNA}-binding and non-sn{RNA}-binding proteins, indicating
	a need for a sufficient number of proteins to train {SVM}. {T}he
	{SVM} classification systems trained in this work were added to our
	{W}eb-based protein functional classification software {SVMP}rot,
	at http://jing.cz3.nus.edu.sg/cgi-bin/svmprot.cgi. {O}ur study suggests
	the potential of {SVM} as a useful tool for facilitating the prediction
	of protein-{RNA} interactions.},
  pdf = {../local/Han2004Prediction.pdf},
  file = {Han2004Prediction.pdf:local/Han2004Prediction.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://www.rnajournal.org/cgi/content/abstract/10/3/355}
}
@article{Han2005Fold,
  author = {Sangjo Han and Byung-Chul Lee and Seung Taek Yu and Chan-Seok Jeong
	and Soyoung Lee and Dongsup Kim},
  title = {Fold recognition by combining profile-profile alignment and support
	vector machine.},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21},
  pages = {2667--2673},
  number = {11},
  month = {Jun},
  abstract = {M{OTIVATION}: {C}urrently, the most accurate fold-recognition method
	is to perform profile-profile alignments and estimate the statistical
	significances of those alignments by calculating {Z}-score or {E}-value.
	{A}lthough this scheme is reliable in recognizing relatively close
	homologs related at the family level, it has difficulty in finding
	the remote homologs that are related at the superfamily or fold level.
	{RESULTS}: {I}n this paper, we present an alternative method to estimate
	the significance of the alignments. {T}he alignment between a query
	protein and a template of length n in the fold library is transformed
	into a feature vector of length n + 1, which is then evaluated by
	support vector machine ({SVM}). {T}he output from {SVM} is converted
	to a posterior probability that a query sequence is related to a
	template, given {SVM} output. {R}esults show that a new method shows
	significantly better performance than {PSI}-{BLAST} and profile-profile
	alignment with {Z}-score scheme. {W}hile {PSI}-{BLAST} and {Z}-score
	scheme detect 16 and 20\% of superfamily-related proteins, respectively,
	at 90\% specificity, a new method detects 46\% of these proteins,
	resulting in more than 2-fold increase in sensitivity. {M}ore significantly,
	at the fold level, a new method can detect 14\% of remotely related
	proteins at 90\% specificity, a remarkable result considering the
	fact that the other methods can detect almost none at the same level
	of specificity.},
  doi = {10.1093/bioinformatics/bti384},
  pdf = {../local/Han2005Fold.pdf},
  file = {Han2005Fold.pdf:local/Han2005Fold.pdf:PDF},
  keywords = {biosvm},
  pii = {bti384},
  url = {http://dx.doi.org/10.1093/bioinformatics/bti384}
}
@article{Hanisch2002Co-clustering,
  author = {D. Hanisch and A. Zien and R. Zimmer and T. Lengauer},
  title = {Co-clustering of biological networks and gene expression data},
  journal = {Bioinformatics},
  year = {2002},
  volume = {18},
  pages = {S145--S154},
  number = {Suppl 1},
  doi = {10.1093/bioinformatics/18.suppl_1.S145},
  subject = {microarraybionet},
  url = {http://cartan.gmd.de/~hanisch/paper/CoClustering.pdf}
}
@article{Harborth2003Sequence,
  author = {Harborth, J. and Elbashir, S. M. and Vandenburgh, K. and Manninga,
	H. and Scaringe, S. A. and Weber, K. and Tuschl, T.},
  title = {Sequence, chemical, and structural variation of small interfering
	{RNA}s and short hairpin {RNA}s and the effect on mammalian gene
	silencing.},
  journal = {Antisense {N}ucleic {A}cid {D}rug {D}ev.},
  year = {2003},
  volume = {13},
  pages = {83--105},
  number = {2},
  month = {Apr},
  abstract = {Small interfering {RNA}s (si{RNA}s) induce sequence-specific gene
	silencing in mammalian cells and guide m{RNA} degradation in the
	process of {RNA} interference ({RNA}i). {B}y targeting endogenous
	lamin {A}/{C} m{RNA} in human {H}e{L}a or mouse {SW}3{T}3 cells,
	we investigated the positional variation of si{RNA}-mediated gene
	silencing. {W}e find cell-type-dependent global effects and cell-type-independent
	positional effects. {H}e{L}a cells were about 2-fold more responsive
	to si{RNA}s than {SW}3{T}3 cells but displayed a very similar pattern
	of positional variation of lamin {A}/{C} silencing. {I}n {H}e{L}a
	cells, 26 of 44 tested standard 21-nucleotide (nt) si{RNA} duplexes
	reduced the protein expression by at least 90\%, and only 2 duplexes
	reduced the lamin {A}/{C} proteins to <50\%. {F}luorescent chromophores
	did not perturb gene silencing when conjugated to the 5'-end or 3'-end
	of the sense si{RNA} strand and the 5'-end of the antisense si{RNA}
	strand, but conjugation to the 3'-end of the antisense si{RNA} abolished
	gene silencing. {RN}ase-protecting phosphorothioate and 2'-fluoropyrimidine
	{RNA} backbone modifications of si{RNA}s did not significantly affect
	silencing efficiency, although cytotoxic effects were observed when
	every second phosphate of an si{RNA} duplex was replaced by phosphorothioate.
	{S}ynthetic {RNA} hairpin loops were subsequently evaluated for lamin
	{A}/{C} silencing as a function of stem length and loop composition.
	{A}s long as the 5'-end of the guide strand coincided with the 5'-end
	of the hairpin {RNA}, 19-29 base pair (bp) hairpins effectively silenced
	lamin {A}/{C}, but when the hairpin started with the 5'-end of the
	sense strand, only 21-29 bp hairpins were highly active.},
  doi = {10.1089/108729003321629638},
  keywords = {Adaptor Protein Complex alpha Subunits, Animal, Animals, Antisense,
	Apolipoproteins B, Base Sequence, Biological Transport, Blotting,
	Catalytic, Cell Line, Cell Membrane, Cell Survival, Chemical, Cholesterol,
	Clathrin, Clathrin Heavy Chains, Disease Models, Endocytosis, Epidermal
	Growth Factor, Fluorescence, Gene Expression Profiling, Gene Silencing,
	Gene Therapy, Hela Cells, Humans, Injections, Intravenous, Jejunum,
	Kinetics, Lamin Type A, Liver, Messenger, Metabolic Syndrome X, Mice,
	Microscopy, Models, Molecular Sequence Data, NIH 3T3 Cells, Non-U.S.
	Gov't, Nucleic Acid, Oligonucleotides, Open Reading Frames, Post-Transcriptional,
	Protein Isoforms, Pyrimidines, RNA, RNA Interference, RNA Processing,
	RNA Stability, Research Support, Reverse Transcriptase Polymerase
	Chain Reaction, Sensitivity and Specificity, Sequence Homology, Small
	Interfering, Subcellular Fractions, Swiss 3T3 Cells, Thionucleotides,
	Time Factors, Transfection, Transferrin, Transgenic, Tumor, Western,
	12804036},
  url = {http://dx.doi.org/10.1089/108729003321629638}
}
@article{Harris2008Single-molecule,
  author = {Timothy D Harris and Phillip R Buzby and Hazen Babcock and Eric Beer
	and Jayson Bowers and Ido Braslavsky and Marie Causey and Jennifer
	Colonell and James Dimeo and J. William Efcavitch and Eldar Giladi
	and Jaime Gill and John Healy and Mirna Jarosz and Dan Lapen and
	Keith Moulton and Stephen R Quake and Kathleen Steinmann and Edward
	Thayer and Anastasia Tyurina and Rebecca Ward and Howard Weiss and
	Zheng Xie},
  title = {Single-molecule {DNA} sequencing of a viral genome.},
  journal = {Science},
  year = {2008},
  volume = {320},
  pages = {106--109},
  number = {5872},
  month = {Apr},
  abstract = {The full promise of human genomics will be realized only when the
	genomes of thousands of individuals can be sequenced for comparative
	analysis. A reference sequence enables the use of short read length.
	We report an amplification-free method for determining the nucleotide
	sequence of more than 280,000 individual DNA molecules simultaneously.
	A DNA polymerase adds labeled nucleotides to surface-immobilized
	primer-template duplexes in stepwise fashion, and the asynchronous
	growth of individual DNA molecules was monitored by fluorescence
	imaging. Read lengths of >25 bases and equivalent phred software
	program quality scores approaching 30 were achieved. We used this
	method to sequence the M13 virus to an average depth of >150x and
	with 100\% coverage; thus, we resequenced the M13 genome with high-sensitivity
	mutation detection. This demonstrates a strategy for high-throughput
	low-cost resequencing.},
  doi = {10.1126/science.1150427},
  institution = {Helicos BioSciences Corporation, One Kendall Square, Cambridge, MA
	02139, USA. tharris@helicosbio.com},
  keywords = {Algorithms; Bacteriophage M13; Computational Biology; DNA Primers;
	DNA, Viral; Genome, Viral; Mutation; Sequence Alignment; Sequence
	Analysis, DNA; Software; Templates, Genetic},
  owner = {phupe},
  pii = {320/5872/106},
  pmid = {18388294},
  timestamp = {2010.08.24},
  url = {http://dx.doi.org/10.1126/science.1150427}
}
@article{Hartwell1999a,
  author      = {L. H. Hartwell and J. J. Hopfield and S. Leibler and A. W. Murray},
  title       = {From molecular to modular cell biology.},
  journal     = {Nature},
  volume      = {402},
  number      = {6761 Suppl},
  pages       = {C47--C52},
  month       = {Dec},
  year        = {1999},
  doi         = {10.1038/35011540},
  url         = {http://dx.doi.org/10.1038/35011540},
  pmid        = {10591225},
  institution = {Fred Hutchinson Cancer Center, Seattle, Washington 98109, USA.},
  language    = {eng},
  keywords    = {Action Potentials; Biological Evolution; Forecasting; Models, Biological;
                 Molecular Biology, trends},
  abstract    = {Cellular functions, such as signal transmission, are carried out by
                 'modules' made up of many species of interacting molecules. Understanding
                 how modules work has depended on combining phenomenological analysis
                 with molecular studies. General principles that govern the structure
                 and behaviour of modules may be discovered with help from synthetic
                 sciences such as engineering and computer science, from stronger
                 interactions between experiment and theory in cell biology, and from
                 an appreciation of evolutionary constraints.},
  medline-pst = {ppublish},
  owner       = {Andrei Zinovyev},
  timestamp   = {2011.04.08}
}
@techreport{Haussler1999Convolution,
  author      = {Haussler, D.},
  title       = {Convolution {K}ernels on {D}iscrete {S}tructures},
  institution = {UC Santa Cruz},
  number      = {UCSC-CRL-99-10},
  year        = {1999},
  keywords    = {biosvm},
  subject     = {kernel},
  pdf         = {../local/Haussler1999Convolution.pdf},
  file        = {Haussler1999Convolution.pdf:local/Haussler1999Convolution.pdf:PDF},
  abstract    = {We introduce a new method of constructing kernels on sets whose elements
                 are discrete structures like strings, trees and graphs. {T}he method
                 can be applied iteratively to build a kernel on a infinite set from
                 kernels involving generators of the set. {T}he family of kernels
                 generated generalizes the family of radial basis kernels. {I}t can
                 also be used to define kernels in the form of joint {G}ibbs probability
                 distributions. {K}ernels can be built from hidden {M}arkov random
                 fields, generalized regular expressions, pair-{HMM}s, or {ANOVA}
                 decompositions. {U}ses of the method lead to open problems involving
                 the theory of infinitely divisible positive definite functions. {F}undamentals
                 of this theory and the theory of reproducing kernel {H}ilbert spaces
                 are reviewed and applied in establishing the validity of the method.}
}
@incollection{Heckerman1999tutorial,
  author    = {Heckerman, D.},
  title     = {A tutorial on learning with {B}ayesian networks},
  editor    = {Jordan, M.},
  booktitle = {Learning in graphical models},
  pages     = {301--354},
  publisher = {MIT Press},
  address   = {Cambridge, MA, USA},
  year      = {1999},
  keywords  = {biogm},
  pdf       = {../local/Heckerman1999tutorial.pdf},
  file      = {Heckerman1999tutorial.pdf:local/Heckerman1999tutorial.pdf:PDF},
  owner     = {vert},
  timestamp = {2006.01.18}
}
@article{Helma2004Data,
  author = {Helma, C. and Cramer, T. and Kramer, S. and De Raedt, L.},
  title = {Data mining and machine learning techniques for the identification
	of mutagenicity inducing substructures and structure activity relationships
	of noncongeneric compounds},
  journal = {J. Chem. Inf. Comput. Sci.},
  year = {2004},
  volume = {44},
  pages = {1402--1411},
  number = {4},
  abstract = {This paper explores the utility of data mining and machine learning
	algorithms for the induction of mutagenicity structure-activity relationships
	({SAR}s) from noncongeneric data sets. {W}e compare (i) a newly developed
	algorithm ({MOLFEA}) for the generation of descriptors (molecular
	fragments) for noncongeneric compounds with traditional {SAR} approaches
	(molecular properties) and (ii) different machine learning algorithms
	for the induction of {SAR}s from these descriptors. {I}n addition
	we investigate the optimal parameter settings for these programs
	and give an exemplary interpretation of the derived models. {T}he
	predictive accuracies of models using {MOLFEA} derived descriptors
	is approximately 10-15\%age points higher than those using molecular
	properties alone. {U}sing both types of descriptors together does
	not improve the derived models. {F}rom the applied machine learning
	techniques the rule learner {PART} and support vector machines gave
	the best results, although the differences between the learning algorithms
	are only marginal. {W}e were able to achieve predictive accuracies
	up to 78\% for 10-fold cross-validation. {T}he resulting models are
	relatively easy to interpret and usable for predictive as well as
	for explanatory purposes.},
  doi = {10.1021/ci034254q},
  pdf = {../local/Helma2004Data.pdf},
  file = {Helma2004Data.pdf:local/Helma2004Data.pdf:PDF},
  keywords = {biosvm chemoinformatics},
  url = {http://dx.doi.org/10.1021/ci034254q}
}
@incollection{Hochreiter2004Gene,
  author = {Hochreiter, S. and Obermayer, K.},
  title = {Gene selection for microarray data},
  booktitle = {Kernel {M}ethods in {C}omputational {B}iology},
  publisher = {MIT Press},
  year = {2004},
  editor = {Schölkopf, B. and Tsuda, K. and Vert, J.-P.},
  pages = {319--355},
  address = {Cambridge, MA, USA},
  pdf = {http://cg.ensmp.fr/~vert/publi/04kmcbbook/heterogeneous.pdf},
  file = {heterogeneous.pdf:http\://cg.ensmp.fr/~vert/publi/04kmcbbook/heterogeneous.pdf:PDF},
  keywords = {biosvm},
  owner = {vert}
}
@article{Hofmann2005Concept-based,
  author = {Oliver Hofmann and Dietmar Schomburg},
  title = {Concept-based annotation of enzyme classes.},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21},
  pages = {2059--2066},
  number = {9},
  month = {May},
  abstract = {M{OTIVATION}: {G}iven the explosive growth of biomedical data as well
	as the literature describing results and findings, it is getting
	increasingly difficult to keep up to date with new information. {K}eeping
	databases synchronized with current knowledge is a time-consuming
	and expensive task-one which can be alleviated by automatically gathering
	findings from the literature using linguistic approaches. {W}e describe
	a method to automatically annotate enzyme classes with disease-related
	information extracted from the biomedical literature for inclusion
	in such a database. {RESULTS}: {E}nzyme names for the 3901 enzyme
	classes in the {BRENDA} database, a repository for quantitative and
	qualitative enzyme information, were identified in more than 100,000
	abstracts retrieved from the {P}ub{M}ed literature database. {P}hrases
	in the abstracts were assigned to concepts from the {U}nified {M}edical
	{L}anguage {S}ystem ({UMLS}) utilizing the {M}eta{M}ap program, allowing
	for the identification of disease-related concepts by their semantic
	fields in the {UMLS} ontology. {A}ssignments between enzyme classes
	and diseases were created based on their co-occurrence within a single
	sentence. {F}alse positives could be removed by a variety of filters
	including minimum number of co-occurrences, removal of sentences
	containing a negation and the classification of sentences based on
	their semantic fields by a {S}upport {V}ector {M}achine. {V}erification
	of the assignments with a manually annotated set of 1500 sentences
	yielded favorable results of 92\% precision at 50\% recall, sufficient
	for inclusion in a high-quality database. {AVAILABILITY}: {S}ource
	code is available from the author upon request. {SUPPLEMENTARY} {INFORMATION}:
	ftp.uni-koeln.de/institute/biochemie/pub/brenda/info/disease{S}upp.pdf.},
  doi = {10.1093/bioinformatics/bti284},
  pdf = {../local/Hofmann2005Concept-based.pdf},
  file = {Hofmann2005Concept-based.pdf:local/Hofmann2005Concept-based.pdf:PDF},
  keywords = {biosvm},
  pii = {bti284},
  url = {http://dx.doi.org/10.1093/bioinformatics/bti284}
}
@article{Hou2004Remote,
  author = {Hou, Y. and Hsu, W. and Lee, M. L. and Bystroff, C.},
  title = {Remote homolog detection using local sequence-structure correlations.},
  journal = {Proteins},
  year = {2004},
  volume = {57},
  pages = {518--530},
  number = {3},
  abstract = {Remote homology detection refers to the detection of structural homology
	in proteins when there is little or no sequence similarity. {I}n
	this article, we present a remote homolog detection method called
	{SVM}-{HMMSTR} that overcomes the reliance on detectable sequence
	similarity by transforming the sequences into strings of hidden {M}arkov
	states that represent local folding motif patterns. {T}hese state
	strings are transformed into fixed-dimension feature vectors for
	input to a support vector machine. {T}wo sets of features are defined:
	an order-independent feature set that captures the amino acid and
	local structure composition; and an order-dependent feature set that
	captures the sequential ordering of the local structures. {T}ests
	using the {S}tructural {C}lassification of {P}roteins ({SCOP}) 1.53
	data set show that the {SVM}-{HMMSTR} gives a significant improvement
	over several current methods.},
  doi = {10.1002/prot.20221},
  pdf = {../local/Hou2004Remote.pdf},
  file = {Hou2004Remote.pdf:local/Hou2004Remote.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert}
}
@article{Hou2003Efficient,
  author = {Hou, Y. and Hsu, W. and Lee, M. L. and Bystroff, C.},
  title = {Efficient remote homology detection using local structure},
  journal = {Bioinformatics},
  year = {2003},
  volume = {19},
  pages = {2294--2301},
  number = {17},
  abstract = {Motivation: {T}he function of an unknown biological sequence can often
	be accurately inferred if we are able to map this unknown sequence
	to its corresponding homologous family. {A}t present, discriminative
	methods such as {SVM}-{F}isher and {SVM}-pairwise, which combine
	support vector machine ({SVM}) and sequence similarity, are recognized
	as the most accurate methods, with {SVM}-pairwise being the most
	accurate. {H}owever, these methods typically encode sequence information
	into their feature vectors and ignore the structure information.
	{T}hey are also computationally inefficient. {B}ased on these observations,
	we present an alternative method for {SVM}-based protein classification.
	{O}ur proposed method, {SVM}-{I}-sites, utilizes structure similarity
	for remote homology detection. {R}esult: {W}e run experiments on
	the {S}tructural {C}lassification of {P}roteins 1.53 data set. {T}he
	results show that {SVM}-{I}-sites is more efficient than {SVM}-pairwise.
	{F}urther, we find that {SVM}-{I}-sites outperforms sequence-based
	methods such as {PSI}-{BLAST}, {SAM}, and {SVM}-{F}isher while achieving
	a comparable performance with {SVM}-pairwise. {A}vailability: {I}-sites
	server is accessible through the web at http://www.bioinfo.rpi.edu.
	{P}rograms are available upon request for academics. {L}icensing
	agreements are available for commercial interests. {T}he framework
	of encoding local structure into feature vector is available upon
	request.},
  pdf = {../local/Hou2003Efficient.pdf},
  file = {Hou2003Efficient.pdf:local/Hou2003Efficient.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/17/2294}
}
@article{Hu2004Developing,
  author = {Hu, C. and Li, X. and Liang, J.},
  title = {Developing optimal non-linear scoring function for protein design},
  journal = {Bioinformatics},
  year = {2004},
  volume = {20},
  pages = {3080--3098},
  number = {17},
  abstract = {Motivation. {P}rotein design aims to identify sequences compatible
	with a given protein fold but incompatible to any alternative folds.
	{T}o select the correct sequences and to guide the search process,
	a design scoring function is critically important. {S}uch a scoring
	function should be able to characterize the global fitness landscape
	of many proteins simultaneously. {R}esults: {T}o find optimal design
	scoring functions, we introduce two geometric views and propose a
	formulation using a mixture of non-linear {G}aussian kernel functions.
	{W}e aim to solve a simplified protein sequence design problem. {O}ur
	goal is to distinguish each native sequence for a major portion of
	representative protein structures from a large number of alternative
	decoy sequences, each a fragment from proteins of different folds.
	{O}ur scoring function discriminates perfectly a set of 440 native
	proteins from 14 million sequence decoys. {W}e show that no linear
	scoring function can succeed in this task. {I}n a blind test of unrelated
	proteins, our scoring function misclassifies only 13 native proteins
	out of 194. {T}his compares favorably with about three-four times
	more misclassifications when optimal linear functions reported in
	the literature are used. {W}e also discuss how to develop protein
	folding scoring function. {A}vailability: {A}vailable on request
	from the authors.},
  doi = {10.1093/bioinformatics/bth369},
  pdf = {../local/Hu2004Developing},
  file = {Hu2004Developing:local/Hu2004Developing:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/20/17/3080}
}
@article{Hu2004Improved,
  author = {Hu, H.J. and Pan, Y. and Harrison, R. and Tai, P.C.},
  title = {Improved protein secondary structure prediction using support vector
	machine with a new encoding scheme and an advanced tertiary classifier},
  journal = {I{EEE} {T}rans. {N}anobioscience},
  year = {2004},
  volume = {3},
  pages = {265--271},
  number = {4},
  abstract = {Prediction of protein secondary structures is an important problem
	in bioinformatics and has many applications. {T}he recent trend of
	secondary structure prediction studies is mostly based on the neural
	network or the support vector machine ({SVM}). {T}he {SVM} method
	is a comparatively new learning system which has mostly been used
	in pattern recognition problems. {I}n this study, {SVM} is used as
	a machine learning tool for the prediction of secondary structure
	and several encoding schemes, including orthogonal matrix, hydrophobicity
	matrix, {BLOSUM}62 substitution matrix, and combined matrix of these,
	are applied and optimized to improve the prediction accuracy. {A}lso,
	the optimal window length for six {SVM} binary classifiers is established
	by testing different window sizes and our new encoding scheme is
	tested based on this optimal window size via sevenfold cross validation
	tests. {T}he results show 2\% increase in the accuracy of the binary
	classifiers when compared with the instances in which the classical
	orthogonal matrix is used. {F}inally, to combine the results of the
	six {SVM} binary classifiers, a new tertiary classifier which combines
	the results of one-versus-one binary classifiers is introduced and
	the performance is compared with those of existing tertiary classifiers.
	{A}ccording to the results, the {Q}3 prediction accuracy of new tertiary
	classifier reaches 78.8\% and this is better than the best result
	reported in the literature.},
  pdf = {../local/Hu2004Improved.pdf},
  file = {Hu2004Improved.pdf:local/Hu2004Improved.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert}
}
@article{Hua2005Optimal,
  author = {Hua, J. and Xiong, Z. and Lowey, J. and Suh, E. and Dougherty, E.
	R.},
  title = {Optimal number of features as a function of sample size for various
	classification rules},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21},
  pages = {1509--1515},
  number = {8},
  month = {Apr},
  abstract = {Motivation: {G}iven the joint feature-label distribution, increasing
	the number of features always results in decreased classification
	error; however, this is not the case when a classifier is designed
	via a classification rule from sample data. {T}ypically (but not
	always), for fixed sample size, the error of a designed classifier
	decreases and then increases as the number of features grows. {T}he
	potential downside of using too many features is most critical for
	small samples, which are commonplace for gene-expression-based classifiers
	for phenotype discrimination. {F}or fixed sample size and feature-label
	distribution, the issue is to find an optimal number of features. {R}esults:
	{S}ince only in rare cases is there a known distribution of the error
	as a function of the number of features and sample size, this study
	employs simulation for various feature-label distributions and classification
	rules, and across a wide range of sample and feature-set sizes. {T}o
	achieve the desired end, finding the optimal number of features as
	a function of sample size, it employs massively parallel computation.
	{S}even classifiers are treated: 3-nearest-neighbor, {G}aussian kernel,
	linear support vector machine, polynomial support vector machine,
	perceptron, regular histogram and linear discriminant analysis. {T}hree
	{G}aussian-based models are considered: linear, nonlinear and bimodal.
	{I}n addition, real patient data from a large breast-cancer study
	is considered. {T}o mitigate the combinatorial search for finding
	optimal feature sets, and to model the situation in which subsets
	of genes are co-regulated and correlation is internal to these subsets,
	we assume that the covariance matrix of the features is blocked,
	with each block corresponding to a group of correlated features.
	{A}ltogether there is a large number of error surfaces for the many
	cases. {T}hese are provided in full on a companion web-site, which
	is meant to serve as resource for those working with small-sample
	classification. {A}vailability: {F}or the companion web-site, please
	visit http://public.tgen.org/tamu/ofs/.},
  doi = {10.1093/bioinformatics/bti171},
  pdf = {../local/Hua2005Optimal.pdf},
  file = {Hua2005Optimal.pdf:local/Hua2005Optimal.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/bti171v1}
}
@article{Hua2001Novel,
  author = {Hua, S. and Sun, Z.},
  title = {A Novel Method of Protein Secondary Structure Prediction with High
	Segment Overlap Measure: Support Vector Machine Approach},
  journal = {J. {M}ol. {B}iol.},
  year = {2001},
  volume = {308},
  pages = {397--407},
  number = {2},
  month = {Apr},
  doi = {10.1006/jmbi.2001.4580},
  pdf = {../local/Hua2001Novel.pdf},
  file = {Hua2001Novel.pdf:local/Hua2001Novel.pdf:PDF},
  keywords = {biosvm},
  subject = {biokernel}
}
@article{Hua2001Support,
  author = {Hua, S. and Sun, Z.},
  title = {Support vector machine approach for protein subcellular localization
	prediction},
  journal = {Bioinformatics},
  year = {2001},
  volume = {17},
  pages = {721--728},
  number = {8},
  abstract = {Motivation: {S}ubcellular localization is a key functional characteristic
	of proteins. {A} fully automatic and reliable prediction system for
	protein subcellular localization is needed, especially for the analysis
	of large-scale genome sequences. {R}esults: {I}n this paper, {S}upport
	{V}ector {M}achine has been introduced to predict the subcellular
	localization of proteins from their amino acid compositions. {T}he
	total prediction accuracies reach 91.4\% for three subcellular locations
	in prokaryotic organisms and 79.4\% for four locations in eukaryotic
	organisms. {P}redictions by our approach are robust to errors in
	the protein {N}-terminal sequences. {T}his new approach provides
	superior prediction performance compared with existing algorithms
	based on amino acid composition and can be a complementary method
	to other existing methods based on sorting signals. {A}vailability:
	{A} web server implementing the prediction method is available at
	http://www.bioinfo.tsinghua.edu.cn/{S}ub{L}oc/. {C}ontact: sunzhr@mail.tsinghua.edu.cn;
	huasj00@mails.tsinghua.edu.cn {S}upplementary information: {S}upplementary
	material is available at http://www.bioinfo.tsinghua.edu.cn/{S}ub{L}oc},
  pdf = {../local/Hua2001Support.pdf},
  file = {Hua2001Support.pdf:local/Hua2001Support.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/17/8/721}
}
@inproceedings{Huan2004Accurate,
  author = {Huan, J. and Wang, W. and Washington, A. and Prins, J. and Shah,
	R. and Tropsha, A.},
  title = {Accurate classification of protein structural families using coherent
	subgraph analysis.},
  booktitle = {Proceedings of the {P}acific {S}ymposium on {B}iocomputing 2004},
  year = {2004},
  pages = {411--422},
  abstract = {Protein structural annotation and classification is an important problem
	in bioinformatics. {W}e report on the development of an efficient
	subgraph mining technique and its application to finding characteristic
	substructural patterns within protein structural families. {I}n our
	method, protein structures are represented by graphs where the nodes
	are residues and the edges connect residues found within certain
	distance from each other. {A}pplication of subgraph mining to proteins
	is challenging for a number reasons: (1) protein graphs are large
	and complex, (2) current protein databases are large and continue
	to grow rapidly, and (3) only a small fraction of the frequent subgraphs
	among the huge pool of all possible subgraphs could be significant
	in the context of protein classification. {T}o address these challenges,
	we have developed an information theoretic model called coherent
	subgraph mining. {F}rom information theory, the entropy of a random
	variable {X} measures the information content carried by {X} and
	the {M}utual {I}nformation ({MI}) between two random variables {X}
	and {Y} measures the correlation between {X} and {Y}. {W}e define
	a subgraph {X} as coherent if it is strongly correlated with every
	sufficiently large sub-subgraph {Y} embedded in it. {B}ased on the
	{MI} metric, we have designed a search scheme that only reports coherent
	subgraphs. {T}o determine the significance of coherent protein subgraphs,
	we have conducted an experimental study in which all coherent subgraphs
	were identified in several protein structural families annotated
	in the {SCOP} database ({M}urzin et al, 1995). {T}he {S}upport {V}ector
	{M}achine algorithm was used to classify proteins from different
	families under the binary classification scheme. {W}e find that this
	approach identifies spatial motifs unique to individual {SCOP} families
	and affords excellent discrimination between families.},
  pdf = {../local/Huan2004Accurate.pdf},
  file = {Huan2004Accurate.pdf:local/Huan2004Accurate.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert}
}
@article{Huang2005Support,
  author = {Jing Huang and Feng Shi},
  title = {Support vector machines for predicting apoptosis proteins types.},
  journal = {Acta {B}iotheor.},
  year = {2005},
  volume = {53},
  pages = {39--47},
  number = {1},
  abstract = {Apoptosis proteins have a central role in the development and homeostasis
	of an organism. {T}hese proteins are very important for understanding
	the mechanism of programmed cell death, and their function is related
	to their types. {A}ccording to the classification scheme by {Z}hou
	and {D}octor (2003), the apoptosis proteins are categorized into
	the following four types: (1) cytoplasmic protein; (2) plasma membrane-bound
	protein; (3) mitochondrial inner and outer proteins; (4) other proteins.
	{A} powerful learning machine, the {S}upport {V}ector {M}achine,
	is applied for predicting the type of a given apoptosis protein by
	incorporating the sqrt-amino acid composition effect. {H}igh success
	rates were obtained by the re-substitute test (98/98 = 100 \%) and
	the jackknife test (89/98 = 90.8\%).},
  doi = {10.1007/s10441-005-7002-5},
  pdf = {../local/Huang2005Support.pdf},
  file = {Huang2005Support.pdf:local/Huang2005Support.pdf:PDF},
  keywords = {biosvm},
  url = {http://dx.doi.org/10.1007/s10441-005-7002-5}
}
@article{Huang2005CTKPred,
  author = {Huang, N. and Chen, H. and Sun, Z.},
  title = {C{TKP}red: an {SVM}-based method for the prediction and classification
	of the cytokine superfamily.},
  journal = {Protein {E}ng. {D}es. {S}el.},
  year = {2005},
  month = jun,
  abstract = {Cell proliferation, differentiation and death are controlled by a
	multitude of cell-cell signals and loss of this control has devastating
	consequences. {P}rominent among these regulatory signals is the cytokine
	superfamily, which has crucial functions in the development, differentiation
	and regulation of immune cells. {I}n this study, a support vector
	machine ({SVM})-based method was developed for predicting families
	and subfamilies of cytokines using dipeptide composition. {T}he taxonomy
	of the cytokine superfamily with which our method complies was described
	in the {C}ytokine {F}amily c{DNA} {D}atabase (db{CFC}) and the dataset
	used in this study for training and testing was obtained from the
	db{CFC} and {S}tructural {C}lassification of {P}roteins ({SCOP}).
	{T}he method classified cytokines and non-cytokines with an accuracy
	of 92.5\% by 7-fold cross-validation. {T}he method is further able
	to predict seven major classes of cytokine with an overall accuracy
	of 94.7\%. {A} server for recognition and classification of cytokines
	based on multi-class {SVM}s has been set up at http://bioinfo.tsinghua.edu.cn/~huangni/{CTKP}red/.},
  doi = {10.1093/protein/gzi041},
  pdf = {../local/Huang2005CTKPred.pdf},
  file = {Huang2005CTKPred.pdf:local/Huang2005CTKPred.pdf:PDF},
  keywords = {biosvm},
  pii = {gzi041},
  url = {http://dx.doi.org/10.1093/protein/gzi041}
}
@article{Huang2005Computation,
  author = {Shao-Wei Huang and Jenn-Kang Hwang},
  title = {Computation of conformational entropy from protein sequences using
	the machine-learning method--application to the study of the relationship
	between structural conservation and local structural stability.},
  journal = {Proteins},
  year = {2005},
  volume = {59},
  pages = {802--809},
  number = {4},
  month = jun,
  abstract = {A complete protein sequence can usually determine a unique conformation;
	however, the situation is different for shorter subsequences--some
	of them are able to adopt unique conformations, independent of context;
	while others assume diverse conformations in different contexts.
	{T}he conformations of subsequences are determined by the interplay
	between local and nonlocal interactions. {A} quantitative measure
	of such structural conservation or variability will be useful in
	the understanding of the sequence-structure relationship. {I}n this
	report, we developed an approach using the support vector machine
	method to compute the conformational variability directly from sequences,
	which is referred to as the sequence structural entropy. {A}s a practical
	application, we studied the relationship between sequence structural
	entropy and the hydrogen exchange for a set of well-studied proteins.
	{W}e found that the slowest exchange cores usually comprise amino
	acids of the lowest sequence structural entropy. {O}ur results indicate
	that structural conservation is closely related to the local structural
	stability. {T}his relationship may have interesting implications
	in the protein folding processes, and may be useful in the study
	of the sequence-structure relationship.},
  doi = {10.1002/prot.20462},
  pdf = {../local/Huang2005Computation.pdf},
  file = {Huang2005Computation.pdf:local/Huang2005Computation.pdf:PDF},
  keywords = {biosvm},
  url = {http://dx.doi.org/10.1002/prot.20462}
}
@article{Huang2005Gene,
  author = {Huang, T. M. and Kecman, V.},
  title = {Gene extraction for cancer diagnosis by support vector machines-{A}n
	improvement.},
  journal = {Artif. {I}ntell. {M}ed.},
  year = {2005},
  month = jul,
  abstract = {O{BJECTIVE}:: {T}o improve the performance of gene extraction for
	cancer diagnosis by recursive feature elimination with support vector
	machines ({RFE}-{SVM}s): {A} cancer diagnosis by using the {DNA}
	microarray data faces many challenges the most serious one being
	the presence of thousands of genes and only several dozens (at the
	best) of patient's samples. {T}hus, making any kind of classification
	in high-dimensional spaces from a limited number of data is both
	an extremely difficult and a prone to an error procedure. {T}he improved
	{RFE}-{SVM}s is introduced and used here for an elimination of less
	relevant genes and just for a reduction of the overall number of
	genes used in a medical diagnostic. {METHODS}:: {T}he paper shows
	why and how the, usually neglected, penalty parameter {C} and some
	standard data preprocessing techniques (normalizing and scaling)
	influence classification results and the gene selection of {RFE}-{SVM}s.
	{T}he gene selected by {RFE}-{SVM}s is compared with eight other
	gene selection algorithms implemented in the {R}ankgene software
	to investigate whether there is any consensus among the algorithms,
	so the scope of finding the right set of genes can be reduced. {RESULTS}::
	{T}he improved {RFE}-{SVM}s is applied on the two benchmarking colon
	and lymphoma cancer data sets with various {C} parameters and different
	standard preprocessing techniques. {H}ere, decreasing {C} leads to
	the smaller diagnosis error in comparisons to other known methods
	applied to the benchmarking data sets. {W}ith an appropriate parameter
	{C} and with a proper preprocessing procedure, the reduction in a
	diagnosis error is as high as 36\%. {CONCLUSIONS}:: {T}he results
	suggest that with a properly chosen parameter {C}, the extracted
	genes and the constructed classifier will ensure less overfitting
	of the training data leading to an increased accuracy in selecting
	relevant genes. {F}inally, comparison in gene ranking obtained by
	different algorithms shows that there is a significant consensus
	among the various algorithms as to which set of genes is relevant.},
  doi = {10.1016/j.artmed.2005.01.006},
  pdf = {../local/Huang2005Gene.pdf},
  file = {Huang2005Gene.pdf:local/Huang2005Gene.pdf:PDF},
  keywords = {biosvm},
  pii = {S0933-3657(05)00051-5},
  url = {http://dx.doi.org/10.1016/j.artmed.2005.01.006}
}
@article{Hutter2004Prediction,
  author = {Hutter, B. and Schaab, C. and Albrecht, S. and Borgmann, M. and Brunner,
	N. A. and Freiberg, C. and Ziegelbauer, K. and Rock, C. O. and Ivanov,
	I. and Loferer, H.},
  title = {Prediction of {M}echanisms of {A}ction of {A}ntibacterial {C}ompounds
	by {G}ene {E}xpression {P}rofiling},
  journal = {Antimicrob. {A}gents {C}hemother.},
  year = {2004},
  volume = {48},
  pages = {2838--2844},
  number = {8},
  month = aug,
  abstract = {We have generated a database of expression profiles carrying the transcriptional
	responses of the model organism {B}acillus subtilis following treatment
	with 37 well-characterized antibacterial compounds of different classes.
	{T}he database was used to build a predictor for the assignment of
	the mechanisms of action ({M}o{A}s) of antibacterial compounds by
	the use of support vector machines. {T}his predictor was able to
	correctly classify the {M}o{A} class for most compounds tested. {F}urthermore,
	we provide evidence that the in vivo {M}o{A} of hexachlorophene does
	not match the {M}o{A} predicted from in vitro data, a situation frequently
	faced in drug discovery. {A} database of this kind may facilitate
	the prioritization of novel antibacterial entities in drug discovery
	programs. {P}otential applications and limitations are discussed.},
  doi = {10.1128/AAC.48.8.2838-2844.2004},
  eprint = {http://aac.asm.org/cgi/reprint/48/8/2838.pdf},
  pdf = {../local/Hutter2004Prediction.pdf},
  file = {Hutter2004Prediction.pdf:local/Hutter2004Prediction.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1128/AAC.48.8.2838-2844.2004}
}
@article{Ifantis2003nonlinear,
  author = {A. Ifantis and S. Papadimitriou},
  title = {The nonlinear predictability of the electrotelluric field variations
	data analyzed with support vector machines as an earthquake precursor.},
  journal = {Int {J} {N}eural {S}yst},
  year = {2003},
  volume = {13},
  pages = {315--332},
  number = {5},
  month = oct,
  abstract = {This work investigates the nonlinear predictability of the {E}lectro
	{T}elluric {F}ield ({ETF}) variations data in order to develop new
	intelligent tools for the difficult task of earthquake prediction.
	{S}upport {V}ector {M}achines trained on a signal window have been
	used to predict the next sample. {W}e observe a significant increase
	at this short-term unpredictability of the {ETF} signal at about
	two weeks time period before the major earthquakes that took place
	in regions near the recording devices. {T}he unpredictability increase
	can be attributed to a quick time variation of the dynamics that
	produce the {ETF} signal due to the earthquake generation process.
	{T}hus, this increase can be taken into advantage for signaling for
	an increased possibility of a large earthquake within the next few
	days in the neighboring region of the recording station.},
  keywords = {Air Pollutants, Aircraft, Algorithms, Artificial Intelligence, Automated,
	Base Composition, Comparative Study, Computational Biology, Computer
	Simulation, Computer-Assisted, Computing Methodologies, Cytosine,
	Data Interpretation, Databases, Enhancer Elements (Genetics), Environmental
	Monitoring, Ethanol, Exons, Fourier Transform Infrared, Genetic,
	Guanine, Humans, Image Interpretation, Natural Disasters, Non-P.H.S.,
	Non-U.S. Gov't, Nonlinear Dynamics, Online Systems, P.H.S., Pattern
	Recognition, Photography, Probability, Pyrimidines, RNA Precursors,
	RNA Splice Sites, RNA Splicing, Radiation, Reproducibility of Results,
	Research Support, Sensitivity and Specificity, Signal Processing,
	Spectroscopy, Statistical, Subtraction Technique, Thermodynamics,
	Time Factors, U.S. Gov't, Untranslated Regions, Video Recording,
	Walking, 14652873},
  pii = {S0129065703001674}
}
@article{Imoto2002Estimation,
  author    = {Imoto, S. and Goto, T. and Miyano, S.},
  title     = {Estimation of genetic networks and functional structures between
	genes by using {B}ayesian networks and nonparametric regression.},
  journal   = {Pac. {S}ymp. {B}iocomput.},
  pages     = {175--186},
  year      = {2002},
  abstract  = {We propose a new method for constructing genetic network from gene
	expression data by using {B}ayesian networks. {W}e use nonparametric
	regression for capturing nonlinear relationships between genes and
	derive a new criterion for choosing the network in general situations.
	{I}n a theoretical sense, our proposed theory and methodology include
	previous methods based on {B}ayes approach. {W}e applied the proposed
	method to the {S}. cerevisiae cell cycle data and showed the effectiveness
	of our method by comparing with previous methods.},
  url       = {http://helix-web.stanford.edu/psb02/imoto.pdf},
  pmid      = {11928473},
  keywords  = {biogm},
  owner     = {vert},
  timestamp = {2006.02.16},
  pdf       = {../local/Imoto2002Estimation.pdf},
  file      = {Imoto2002Estimation.pdf:local/Imoto2002Estimation.pdf:PDF}
}
@article{Imoto2003Bayesian,
  author = {Imoto, S. and Kim, S. and Goto, T. and Miyano, S. and Aburatani,
	S. and Tashiro, K. and Kuhara, S.},
  title = {Bayesian network and nonparametric heteroscedastic regression for
	nonlinear modeling of genetic network.},
  journal = {J. {B}ioinform. {C}omput. {B}iol.},
  year = {2003},
  volume = {1},
  pages = {231--252},
  number = {2},
  month = jul,
  abstract = {We propose a new statistical method for constructing a genetic network
	from microarray gene expression data by using a {B}ayesian network.
	{A}n essential point of {B}ayesian network construction is the estimation
	of the conditional distribution of each random variable. {W}e consider
	fitting nonparametric regression models with heterogeneous error
	variances to the microarray gene expression data to capture the nonlinear
	structures between genes. {S}electing the optimal graph, which gives
	the best representation of the system among genes, is still a problem
	to be solved. {W}e theoretically derive a new graph selection criterion
	from {B}ayes approach in general situations. {T}he proposed method
	includes previous methods based on {B}ayesian networks. {W}e demonstrate
	the effectiveness of the proposed method through the analysis of
	{S}accharomyces cerevisiae gene expression data newly obtained by
	disrupting 100 genes.},
  doi = {10.1142/S0219720003000071},
  pdf = {../local/Imoto2003Bayesian.pdf},
  file = {Imoto2003Bayesian.pdf:local/Imoto2003Bayesian.pdf:PDF},
  keywords = {biogm},
  owner = {vert},
  pii = {S0219720003000071},
  pmid = {15290771},
  timestamp = {2006.02.16},
  url = {http://dx.doi.org/10.1142/S0219720003000071}
}
@article{Imoto2002Bayesian,
  author = {Imoto, S. and Kim, S. and Goto, T. and Aburatani, S. and Tashiro,
	K. and Kuhara, S. and Miyano, S.},
  title = {Bayesian network and nonparametric heteroscedastic regression for
	nonlinear modeling of genetic network.},
  journal = {Proc. {IEEE} {C}omput. {S}oc. {B}ioinform. {C}onf.},
  year = {2002},
  volume = {1},
  pages = {219--227},
  abstract = {We propose a new statistical method for constructing genetic network
	from microarray gene expression data by using a {B}ayesian network.
	{A}n essential point of {B}ayesian network construction is in the
	estimation of the conditional distribution of each random variable.
	{W}e consider fitting nonparametric regression models with heterogeneous
	error variances to the microarray gene expression data to capture
	the nonlinear structures between genes. {A} problem still remains
	to be solved in selecting an optimal graph, which gives the best
	representation of the system among genes. {W}e theoretically derive
	a new graph selection criterion from {B}ayes approach in general
	situations. {T}he proposed method includes previous methods based
	on {B}ayesian networks. {W}e demonstrate the effectiveness of the
	proposed method through the analysis of {S}accharomyces cerevisiae
	gene expression data newly obtained by disrupting 100 genes.},
  doi = {10.1109/CSB.2002.1039344},
  pdf = {../local/Imoto2002Bayesian.pdf},
  file = {Imoto2002Bayesian.pdf:local/Imoto2002Bayesian.pdf:PDF},
  keywords = {biogm},
  owner = {vert},
  pmid = {15838138},
  timestamp = {2006.02.16},
  url = {http://dx.doi.org/10.1109/CSB.2002.1039344}
}
@article{Consortium2001Initial,
  author = {{International Human Genome Sequencing Consortium}},
  title = {Initial sequencing and analysis of the human genome},
  journal = {Nature},
  year = {2001},
  volume = {409},
  pages = {860--921},
  number = {6822},
  month = feb,
  abstract = {The human genome holds an extraordinary trove of information about
	human development, physiology, medicine and evolution. {H}ere we
	report the results of an international collaboration to produce and
	make freely available a draft sequence of the human genome. {W}e
	also present an initial analysis of the data, describing some of
	the insights that can be gleaned from the sequence.},
  doi = {10.1038/35057062},
  pdf = {../local/Consortium2001Initial.pdf},
  file = {Consortium2001Initial.pdf:local/Consortium2001Initial.pdf:PDF},
  keywords = {genomics bio},
  owner = {vert},
  url = {http://dx.doi.org/10.1038/35057062}
}
@article{Ito2001comprehensive,
  author  = {Ito, T. and Chiba, T. and Ozawa, R. and Yoshida, M. and Hattori,
	M. and Sakaki, Y.},
  title   = {A comprehensive two-hybrid analysis to explore the yeast protein
	interactome},
  journal = {Proc. {N}atl. {A}cad. {S}ci. {USA}},
  volume  = {98},
  number  = {8},
  pages   = {4569--4574},
  year    = {2001},
  url     = {http://www.pnas.org/cgi/content/full/98/8/4569},
  subject = {bionet},
  pdf     = {../local/ito01.pdf},
  file    = {ito01.pdf:local/ito01.pdf:PDF}
}
@article{Ito2000Toward,
  author = {Ito, T. and Tashiro, K. and Muta, S. and Ozawa, R. and Chiba, T.
	and Nishizawa, M. and Yamamoto, K. and Kuhara, S. and Sakaki, Y.},
  title = {Toward a protein-protein interaction map of the budding yeast: {A}
	comprehensive system to examine two-hybrid interactions in all possible
	combinations between the yeast proteins},
  journal = {Proc. {N}atl. {A}cad. {S}ci. {USA}},
  year = {2000},
  volume = {97},
  pages = {1143--1147},
  number = {3},
  pdf = {../local/ito00.pdf},
  file = {ito00.pdf:local/ito00.pdf:PDF},
  subject = {bionet},
  url = {http://www.pnas.org/cgi/content/full/97/3/1143}
}
@article{Jaakkola2000Discriminative,
  author   = {Jaakkola, T. and Diekhans, M. and Haussler, D.},
  title    = {A {D}iscriminative {F}ramework for {D}etecting {R}emote {P}rotein
	{H}omologies},
  journal  = {J. {C}omput. {B}iol.},
  volume   = {7},
  number   = {1,2},
  pages    = {95--114},
  year     = {2000},
  url      = {http://www.cse.ucsc.edu/research/compbio/discriminative/Jaakola2-1998.ps},
  keywords = {biosvm},
  subject  = {biokernelcasp},
  pdf      = {../local/jaak00.pdf},
  file     = {jaak00.pdf:local/jaak00.pdf:PDF}
}
@inproceedings{Jaakkola1999Using,
  author    = {Jaakkola, T. S. and Diekhans, M. and Haussler, D.},
  title     = {Using the {F}isher {K}ernel {M}ethod to {D}etect {R}emote {P}rotein
	{H}omologies},
  booktitle = {Proceedings of the {S}eventh {I}nternational {C}onference on {I}ntelligent
	{S}ystems for {M}olecular {B}iology},
  pages     = {149--158},
  publisher = {AAAI Press},
  year      = {1999},
  keywords  = {biosvm},
  owner     = {jeanphilippevert}
}
@inproceedings{Jaakkola1999Exploiting,
  author    = {Jaakkola, T. S. and Haussler, D.},
  title     = {Exploiting generative models in discriminative classifiers},
  booktitle = {Proc. of {T}enth {C}onference on {A}dvances in {N}eural {I}nformation
	{P}rocessing {S}ystems},
  year      = {1999},
  url       = {http://www.cse.ucsc.edu/research/ml/papers/Jaakola.ps},
  keywords  = {biosvm},
  subject   = {kernel},
  pdf       = {../local/jaak99.pdf},
  file      = {jaak99.pdf:local/jaak99.pdf:PDF}
}
@article{Jambon2003New,
  author = {Martin Jambon and Anne Imberty and Gilbert Del{\'e}age and Christophe
	Geourjon},
  title = {A new bioinformatic approach to detect common {3D} sites in protein
	structures.},
  journal = {Proteins},
  year = {2003},
  volume = {52},
  pages = {137--145},
  number = {2},
  month = aug,
  abstract = {An innovative bioinformatic method has been designed and implemented
	to detect similar three-dimensional (3D) sites in proteins. This
	approach allows the comparison of protein structures or substructures
	and detects local spatial similarities: this method is completely
	independent from the amino acid sequence and from the backbone structure.
	In contrast to already existing tools, the basis for this method
	is a representation of the protein structure by a set of stereochemical
	groups that are defined independently from the notion of amino acid.
	An efficient heuristic for finding similarities that uses graphs
	of triangles of chemical groups to represent the protein structures
	has been developed. The implementation of this heuristic constitutes
	a software named SuMo (Surfing the Molecules), which allows the dynamic
	definition of chemical groups, the selection of sites in the proteins,
	and the management and screening of databases. To show the relevance
	of this approach, we focused on two extreme examples illustrating
	convergent and divergent evolution. In two unrelated serine proteases,
	SuMo detects one common site, which corresponds to the catalytic
	triad. In the legume lectins family composed of >100 structures that
	share similar sequences and folds but may have lost their ability
	to bind a carbohydrate molecule, SuMo discriminates between functional
	and non-functional lectins with a selectivity of 96\%. The time needed
	for searching a given site in a protein structure is typically 0.1
	s on a PIII 800MHz/Linux computer; thus, in further studies, SuMo
	will be used to screen the PDB.},
  doi = {10.1002/prot.10339},
  institution = {Institut de Biologie et Chimie des Protéines (IBCP), Lyon, France.},
  keywords = {Algorithms; Catalytic Domain; Chymotrypsin, chemistry/genetics; Computational
	Biology, methods; Evolution, Molecular; Fabaceae, chemistry; Models,
	Molecular; Plant Lectins, chemistry/genetics; Protein Conformation;
	Proteins, chemistry; Reproducibility of Results; Subtilisin, chemistry/genetics},
  owner = {bricehoffmann},
  pmid = {12833538},
  timestamp = {2009.02.13},
  url = {http://dx.doi.org/10.1002/prot.10339}
}
@article{Jansen2003Bayesian,
  author = {Jansen, R. and Yu, H. and Greenbaum, D. and Kluger, Y. and Krogan,
	N. J. and Chung, S. and Emili, A. and Snyder, M. and Greenblatt, J. F.
	and Gerstein, M.},
  title = {A {B}ayesian networks approach for predicting protein-protein interactions
	from genomic data},
  journal = {Science},
  year = {2003},
  volume = {302},
  pages = {449--453},
  number = {5644},
  abstract = {We have developed an approach using {B}ayesian networks to predict
	protein-protein interactions genome-wide in yeast. {O}ur method naturally
	weights and combines into reliable predictions genomic features only
	weakly associated with interaction (e.g., m{RNA} coexpression, coessentiality,
	and colocalization). {I}n addition to de novo predictions, it can
	integrate often noisy, experimental interaction data sets. {W}e observe
	that at given levels of sensitivity, our predictions are more accurate
	than the existing high-throughput experimental data sets. {W}e validate
	our predictions with new {TAP}-tagging (tandem affinity purification)
	experiments.},
  doi = {10.1126/science.1087361},
  pdf = {../local/Jansen2003Bayesian.pdf},
  file = {Jansen2003Bayesian.pdf:local/Jansen2003Bayesian.pdf:PDF},
  keywords = {biogm},
  owner = {vert},
  url = {http://dx.doi.org/10.1126/science.1087361}
}
@article{Jarzab2005Gene,
  author = {Barbara Jarzab and Malgorzata Wiench and Krzysztof Fujarewicz and
	Krzysztof Simek and Michal Jarzab and Malgorzata Oczko-Wojciechowska
	and Jan Wloch and Agnieszka Czarniecka and Ewa Chmielik and Dariusz
	Lange and Agnieszka Pawlaczek and Sylwia Szpak and Elzbieta Gubala
	and Andrzej Swierniak},
  title = {Gene expression profile of papillary thyroid cancer: sources of variability
	and diagnostic implications.},
  journal = {Cancer {R}es.},
  year = {2005},
  volume = {65},
  pages = {1587--1597},
  number = {4},
  month = feb,
  abstract = {The study looked for an optimal set of genes differentiating between
	papillary thyroid cancer ({PTC}) and normal thyroid tissue and assessed
	the sources of variability in gene expression profiles. {T}he analysis
	was done by oligonucleotide microarrays ({G}ene{C}hip {HG}-{U}133{A})
	in 50 tissue samples taken intraoperatively from 33 patients (23
	{PTC} patients and 10 patients with other thyroid disease). {I}n
	the initial group of 16 {PTC} and 16 normal samples, we assessed
	the sources of variability in the gene expression profile by singular
	value decomposition which specified three major patterns of variability.
	{T}he first and the most distinct mode grouped transcripts differentiating
	between tumor and normal tissues. {T}wo consecutive modes contained
	a large proportion of immunity-related genes. {T}o generate a multigene
	classifier for tumor-normal difference, we used support vector machines-based
	technique (recursive feature replacement). {I}t included the following
	19 genes: {DPP}4, {GJB}3, {ST}14, {SERPINA}1, {LRP}4, {MET}, {EVA}1,
	{SPUVE}, {LGALS}3, {HBB}, {MKRN}2, {MRC}2, {IGSF}1, {KIAA}0830, {RXRG},
	{P}4{HA}2, {CDH}3, {IL}13{RA}1, and {MTMR}4, and correctly discriminated
	17 of 18 additional {PTC}/normal thyroid samples and all 16 samples
	published in a previous microarray study. {S}elected novel genes
	({LRP}4, {EVA}1, {TMPRSS}4, {QPCT}, and {SLC}34{A}2) were confirmed
	by {Q}-{PCR}. {O}ur results prove that the gene expression signal
	of {PTC} is easily detectable even when cancer cells do not prevail
	over tumor stroma. {W}e indicate and separate the confounding variability
	related to the immune response. {F}inally, we propose a potent molecular
	classifier able to discriminate between {PTC} and nonmalignant thyroid
	in more than 90\% of investigated samples.},
  doi = {10.1158/0008-5472.CAN-04-3078},
  pdf = {../local/Jarzab2005Gene.pdf},
  file = {Jarzab2005Gene.pdf:local/Jarzab2005Gene.pdf:PDF},
  keywords = {biosvm},
  pii = {65/4/1587},
  url = {http://dx.doi.org/10.1158/0008-5472.CAN-04-3078}
}
@article{Jeong2001Lethality,
  author = {H. Jeong and S. P. Mason and A.-L. Barab{\'a}si and Z. N. Oltvai},
  title = {Lethality and centrality in protein networks},
  journal = {Nature},
  year = {2001},
  volume = {411},
  pages = {41--42},
  number = {6833},
  pdf = {../local/jeon01.pdf},
  file = {jeon01.pdf:local/jeon01.pdf:PDF},
  subject = {bionet},
  url = {http://www.nature.com/cgi-taf/DynaPage.taf?file=/nature/journal/v411/n6833/full/411041a0_fs.html&content_filetype=PDF}
}
@article{Jeong2000large-scale,
  author = {H. Jeong and B. Tombor and R. Albert and Z. N. Oltvai and A.-L. Barab{\'a}si},
  title = {The large-scale organization of metabolic networks},
  journal = {Nature},
  year = {2000},
  volume = {407},
  pages = {651--654},
  number = {6804},
  pdf = {../local/jeon00.pdf},
  file = {jeon00.pdf:local/jeon00.pdf:PDF},
  subject = {bionet},
  url = {http://www.nature.com/cgi-taf/DynaPage.taf?file=/nature/journal/v407/n6804/full/407651a0_fs.html&content_filetype=PDF}
}
@article{Jiang-Ning2004Cooperativity,
  author = {Song, J.-N. and Li, W.-J. and Xu, W.-B.},
  title = {Cooperativity of the oxidization of cysteines in globular proteins.},
  journal = {J. {T}heor. {B}iol.},
  year = {2004},
  volume = {231},
  pages = {85--95},
  number = {1},
  abstract = {Based on the 639 non-homologous proteins with 2910 cysteine-containing
	segments of well-resolved three-dimensional structures, a novel approach
	has been proposed to predict the disulfide-bonding state of cysteines
	in proteins by constructing a two-stage classifier combining a first
	global linear discriminator based on their amino acid composition
	and a second local support vector machine classifier. {T}he overall
	prediction accuracy of this hybrid classifier for the disulfide-bonding
	state of cysteines in proteins has scored 84.1\% and 80.1\%, when measured
	on cysteine and protein basis using the rigorous jack-knife procedure,
	respectively. {I}t shows that whether cysteines should form disulfide
	bonds depends not only on the global structural features of proteins
	but also on the local sequence environment of proteins. {T}he result
	demonstrates the applicability of this novel method and provides
	comparable prediction performance compared with existing methods
	for the prediction of the oxidation states of cysteines in proteins.},
  doi = {10.1016/j.jtbi.2004.06.002},
  pdf = {../local/Jiang-Ning2004Cooperativity.pdf},
  file = {Jiang-Ning2004Cooperativity.pdf:local/Jiang-Ning2004Cooperativity.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1016/j.jtbi.2004.06.002}
}
@article{Jorissen2005Virtual,
  author = {R. N. Jorissen and M. K. Gilson},
  title = {Virtual screening of molecular databases using a support vector machine.},
  journal = {J {C}hem {I}nf {M}odel},
  year = {2005},
  volume = {45},
  pages = {549--561},
  number = {3},
  abstract = {The {S}upport {V}ector {M}achine ({SVM}) is an algorithm that derives
	a model used for the classification of data into two categories and
	which has good generalization properties. {T}his study applies the
	{SVM} algorithm to the problem of virtual screening for molecules
	with a desired activity. {I}n contrast to typical applications of
	the {SVM}, we emphasize not classification but enrichment of actives
	by using a modified version of the standard {SVM} function to rank
	molecules. {T}he method employs a simple and novel criterion for
	picking molecular descriptors and uses cross-validation to select
	{SVM} parameters. {T}he resulting method is more effective at enriching
	for active compounds with novel chemistries than binary fingerprint-based
	methods such as binary kernel discrimination.},
  doi = {10.1021/ci049641u},
  pdf = {../local/Jorissen2005Virtual.pdf},
  file = {Jorissen2005Virtual.pdf:local/Jorissen2005Virtual.pdf:PDF},
  keywords = {biosvm},
  url = {http://dx.doi.org/10.1021/ci049641u}
}
@article{Jovanovic2010epigenetics,
  author = {Jovana Jovanovic and Jo Anders R{\o}nneberg and J{\"o}rg Tost and Vessela
	Kristensen},
  title = {The epigenetics of breast cancer.},
  journal = {Mol Oncol},
  year = {2010},
  volume = {4},
  pages = {242--254},
  number = {3},
  month = jun,
  abstract = {Epigenetic changes can be defined as stable molecular alterations
	of a cellular phenotype such as the gene expression profile of a
	cell that are heritable during somatic cell divisions (and sometimes
	germ line transmissions) but do not involve changes of the DNA sequence
	itself. Epigenetic phenomena are mediated by several molecular mechanisms
	comprising histone modifications, polycomb/trithorax protein complexes,
	small non-coding or antisense RNAs and DNA methylation. These different
	modifications are closely interconnected. Epigenetic regulation is
	critical in normal growth and development and closely conditions
	the transcriptional potential of genes. Epigenetic mechanisms convey
	genomic adaption to an environment thereby ultimately contributing
	towards given phenotype. In this review we will describe the various
	aspects of epigenetics and in particular DNA methylation in breast
	carcinogenesis and their potential application for diagnosis, prognosis
	and treatment decision.},
  doi = {10.1016/j.molonc.2010.04.002},
  institution = {Department for Clinical Molecular Biology (EpiGen), Institute for
	Clinical Medicine, Akershus University Hospital, University of Oslo,
	Norway.},
  keywords = {Breast Neoplasms, diagnosis/genetics/pathology/therapy; Chromatin,
	chemistry/metabolism; DNA Methylation; DNA Modification Methylases,
	metabolism; DNA, chemistry/metabolism; Epigenesis, Genetic; Female;
	Gene Expression Regulation, Neoplastic; Histones, metabolism; Humans;
	MicroRNAs, genetics/metabolism; Molecular Structure; Prognosis; Receptors,
	Estrogen, genetics/metabolism; Tumor Markers, Biological, metabolism},
  language = {eng},
  medline-pst = {ppublish},
  owner = {philippe},
  pii = {S1574-7891(10)00024-4},
  pmid = {20627830},
  timestamp = {2011.06.04},
  url = {http://dx.doi.org/10.1016/j.molonc.2010.04.002}
}
@article{Jonsdottir2005Prediction,
  author = {Svava Osk J{\'o}nsd{\'o}ttir and Flemming Steen J{\o}rgensen and S{\o}ren Brunak},
  title = {Prediction methods and databases within chemoinformatics: emphasis
	on drugs and drug candidates.},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21},
  pages = {2145--2160},
  number = {10},
  month = may,
  abstract = {MOTIVATION: To gather information about available databases and chemoinformatics
	methods for prediction of properties relevant to the drug discovery
	and optimization process. RESULTS: We present an overview of the
	most important databases with 2-dimensional and 3-dimensional structural
	information about drugs and drug candidates, and of databases with
	relevant properties. Access to experimental data and numerical methods
	for selecting and utilizing these data is crucial for developing
	accurate predictive in silico models. Many interesting predictive
	methods for classifying the suitability of chemical compounds as
	potential drugs, as well as for predicting their physico-chemical
	and ADMET properties have been proposed in recent years. These methods
	are discussed, and some possible future directions in this rapidly
	developing field are described.},
  doi = {10.1093/bioinformatics/bti314},
  keywords = {Chemistry, Pharmaceutical; Computational Biology; Databases, Factual;
	Drug Design; Models, Chemical; Models, Molecular; Pharmaceutical
	Preparations; Structure-Activity Relationship},
  owner = {vert},
  pii = {bti314},
  pmid = {15713739},
  timestamp = {2007.08.02},
  url = {http://dx.doi.org/10.1093/bioinformatics/bti314}
}
@article{Kanehisa1997database,
  author = {Kanehisa, M.},
  title = {A database for post-genome analysis},
  journal = {Trends {Genet.}},
  year = {1997},
  volume = {13},
  pages = {375--376},
  doi = {10.1016/S0168-9525(97)01223-7},
  pdf = {../local/Kanehisa1997database.pdf},
  file = {Kanehisa1997database.pdf:local/Kanehisa1997database.pdf:PDF},
  subject = {bionet},
  url = {http://dx.doi.org/10.1016/S0168-9525(97)01223-7}
}
@article{Kanehisa2002KEGG,
  author = {Kanehisa, M. and Goto, S. and Kawashima, S. and Nakaya, A.},
  title = {The {KEGG} databases at {GenomeNet}},
  journal = {Nucleic {Acids} {Res.}},
  year = {2002},
  volume = {30},
  pages = {42--46},
  number = {1},
  pdf = {../local/kane02.pdf},
  file = {kane02.pdf:local/kane02.pdf:PDF},
  subject = {bionet},
  url = {http://nar.oupjournals.org/cgi/content/full/30/1/42}
}
@article{Kaper2004BCI,
  author = {Kaper, Matthias and Meinicke, Peter and Grossekathoefer, Ulf and
	Lingner, Thomas and Ritter, Helge},
  title = {{BCI} {Competition} 2003--{Data} set {IIb}: support vector machines
	for the {P300} speller paradigm.},
  journal = {{IEEE} {Trans} {Biomed} {Eng}},
  year = {2004},
  volume = {51},
  pages = {1073--1076},
  number = {6},
  month = jun,
  abstract = {We propose an approach to analyze data from the {P}300 speller paradigm
	using the machine-learning technique support vector machines. {I}n
	a conservative classification scheme, we found the correct solution
	after five repetitions. {W}hile the classification within the competition
	is designed for offline analysis, our approach is also well-suited
	for a real-world online solution: {I}t is fast, requires only 10
	electrode positions and demands only a small amount of preprocessing.},
  keywords = {Algorithms, Animals, Antisense, Artificial Intelligence, Automated,
	Autonomic Nervous System, Brain, Cell Line, Child, Cluster Analysis,
	Cognition, Comparative Study, Computational Biology, Computer Simulation,
	Computer-Assisted, DNA Fingerprinting, Databases, Drug Evaluation,
	Electroencephalography, Emotions, Event-Related Potentials, Factual,
	Fluorescence, Fuzzy Logic, Gene Silencing, Gene Targeting, Genetic,
	Hela Cells, Humans, Imaging, Intracellular Space, Microscopy, Models,
	Monitoring, Neoplasms, Neural Networks (Computer), Non-U.S. Gov't,
	Oligonucleotides, P.H.S., P300, Pattern Recognition, Peptides, Physiologic,
	Preclinical, Predictive Value of Tests, Preschool, Prognosis, Protein
	Interaction Mapping, Protein Structure, Proteins, Proteomics, Quantitative
	Structure-Activity Relationship, Quaternary, RNA, RNA Interference,
	Recognition (Psychology), Reproducibility of Results, Research Support,
	Sensitivity and Specificity, Signal Processing, Small Interfering,
	Software, Thionucleotides, Three-Dimensional, Tumor, U.S. Gov't,
	User-Computer Interface, Word Processing},
  pmid = {15188881}
}
@article{Kapetanovic2004Overview,
  author = {Kapetanovic, Izet M. and Rosenfeld, Simon and Izmirlian, Grant},
  title = {Overview of commonly used bioinformatics methods and their applications.},
  journal = {Ann {N} {Y} {Acad} {Sci}},
  year = {2004},
  volume = {1020},
  pages = {10--21},
  month = may,
  abstract = {Bioinformatics, in its broad sense, involves application of computer
	processes to solve biological problems. {A} wide range of computational
	tools are needed to effectively and efficiently process large amounts
	of data being generated as a result of recent technological innovations
	in biology and medicine. {A} number of computational tools have been
	developed or adapted to deal with the experimental riches of complex
	and multivariate data and transition from data collection to information
	or knowledge. {T}hese include a wide variety of clustering and classification
	algorithms, including self-organized maps ({SOM}), artificial neural
	networks ({ANN}), support vector machines ({SVM}), fuzzy logic, and
	even hyphenated techniques as neuro-fuzzy networks. {T}hese bioinformatics
	tools are being evaluated and applied in various medical areas including
	early detection, risk assessment, classification, and prognosis of
	cancer. {T}he goal of these efforts is to develop and identify bioinformatics
	methods with optimal sensitivity, specificity, and predictive capabilities.},
  doi = {10.1196/annals.1310.003},
  pdf = {../local/Kapetanovic2004Overview.pdf},
  file = {Kapetanovic2004Overview.pdf:local/Kapetanovic2004Overview.pdf:PDF},
  keywords = {Computational Biology, Fuzzy Logic, Humans, Neoplasms, Neural Networks
	(Computer), Prognosis},
  pii = {1020/1/10},
  pmid = {15208179},
  url = {http://dx.doi.org/10.1196/annals.1310.003}
}
@article{Karchin2002Classifying,
  author = {Karchin, R. and Karplus, K. and Haussler, D.},
  title = {Classifying {G}-protein coupled receptors with support vector machines},
  journal = {Bioinformatics},
  year = {2002},
  volume = {18},
  pages = {147--159},
  number = {1},
  abstract = {Motivation: {T}he enormous amount of protein sequence data uncovered
	by genome research has increased the demand for computer software
	that can automate the recognition of new proteins. {W}e discuss the
	relative merits of various automated methods for recognizing {G}-{P}rotein
	{C}oupled {R}eceptors ({GPCR}s), a superfamily of cell membrane proteins.
	{GPCR}s are found in a wide range of organisms and are central to
	a cellular signalling network that regulates many basic physiological
	processes. {T}hey are the focus of a significant amount of current
	pharmaceutical research because they play a key role in many diseases.
	{H}owever, their tertiary structures remain largely unsolved. {T}he
	methods described in this paper use only primary sequence information
	to make their predictions. {W}e compare a simple nearest neighbor
	approach ({BLAST}), methods based on multiple alignments generated
	by a statistical profile {H}idden {M}arkov {M}odel ({HMM}), and methods,
	including {S}upport {V}ector {M}achines ({SVM}s), that transform
	protein sequences into fixed-length feature vectors. {R}esults: {T}he
	last is the most computationally expensive method, but our experiments
	show that, for those interested in annotation-quality classification,
	the results are worth the effort. {I}n two-fold cross-validation
	experiments testing recognition of {GPCR} subfamilies that bind a
	specific ligand (such as a histamine molecule), the errors per sequence
	at the {M}inimum {E}rror {P}oint ({MEP}) were 13.7\% for multi-class
	{SVM}s, 17.1\% for our {SVM}tree method of hierarchical multi-class
	{SVM} classification, 25.5\% for {BLAST}, 30\% for profile {HMM}s,
	and 49\% for classification based on nearest neighbor feature vector
	{K}ernel {N}earest {N}eighbor (kern{NN}). {T}he percentage of true
	positives recognized before the first false positive was 65\% for
	both {SVM} methods, 13\% for {BLAST}, 5\% for profile {HMM}s and 4\%
	for kern{NN}. {A}vailability: {W}e have set up a web server for {GPCR}
	subfamily classification based on hierarchical multi-class {SVM}s
	at http://www.soe.ucsc.edu/research/compbio/gpcr-subclass. {B}y scanning
	predicted peptides found in the human genome with the {SVM}tree server,
	we have identified a large number of genes that encode {GPCR}s. {A}
	list of our predictions for human {GPCR}s is available at http://www.soe.ucsc.edu/research/compbio/gpcr\_hg/class\_results.
	{W}e also provide suggested subfamily classification for 18 sequences
	previously identified as unclassified {C}lass {A} (rhodopsin-like)
	{GPCR}s in {GPCRDB} ({H}orn et al., {N}ucleic {A}cids {R}es., 26,
	277--281, 1998), available at http://www.soe.ucsc.edu/research/compbio/gpcr/class{A}\_unclassified/},
  comment = {Un papier intéressant sur l'utilisation du Fisher kernel pour classer
	les GPCR, une famille de protéines importante pour l'industrie pharmaceutique.},
  pdf = {../local/Karchin2002Classifying.pdf},
  file = {Karchin2002Classifying.pdf:local/Karchin2002Classifying.pdf:PDF},
  keywords = {fisher-kernel sequence-classification biosvm},
  subject = {biokernel},
  url = {http://bioinformatics.oupjournals.org/cgi/reprint/18/1/147}
}
@article{Karchin2005Improving,
  author = {Karchin, R. and Kelly, L. and Sali, A.},
  title = {Improving functional annotation of non-synonomous {SNP}s with information
	theory.},
  journal = {Pac. {Symp.} {Biocomput.}},
  year = {2005},
  pages = {397--408},
  abstract = {Automated functional annotation of ns{SNP}s requires that amino-acid
	residue changes are represented by a set of descriptive features,
	such as evolutionary conservation, side-chain volume change, effect
	on ligand-binding, and residue structural rigidity. {I}dentifying
	the most informative combinations of features is critical to the
	success of a computational prediction method. {W}e rank 32 features
	according to their mutual information with functional effects of
	amino-acid substitutions, as measured by in vivo assays. {I}n addition,
	we use a greedy algorithm to identify a subset of highly informative
	features. {T}he method is simple to implement and provides a quantitative
	measure for selecting the best predictive features given a set of
	features that a human expert believes to be informative. {W}e demonstrate
	the usefulness of the selected highly informative features by cross-validated
	tests of a computational classifier, a support vector machine ({SVM}).
	{T}he {SVM}'s classification accuracy is highly correlated with the
	ranking of the input features by their mutual information. {T}wo
	features describing the solvent accessibility of ``wild-type'' and
	``mutant'' amino-acid residues and one evolutionary feature based on
	superfamily-level multiple alignments produce comparable overall
	accuracy and 6\% fewer false positives than a 32-feature set that
	considers physiochemical properties of amino acids, protein electrostatics,
	amino-acid residue flexibility, and binding interactions.},
  keywords = {biosvm}
}
@article{Karklin2005Classification,
  author = {Karklin, Y. and Meraz, R. F. and Holbrook, S. R.},
  title = {Classification of non-coding {RNA} using graph representations of
	secondary structure.},
  journal = {Pac. {Symp.} {Biocomput.}},
  year = {2005},
  pages = {4--15},
  abstract = {Some genes produce transcripts that function directly in regulatory,
	catalytic, or structural roles in the cell. {T}hese non-coding {RNA}s
	are prevalent in all living organisms, and methods that aid the understanding
	of their functional roles are essential. {RNA} secondary structure,
	the pattern of base-pairing, contains the critical information for
	determining the three dimensional structure and function of the molecule.
	{I}n this work we examine whether the basic geometric and topological
	properties of secondary structure are sufficient to distinguish between
	{RNA} families in a learning framework. {F}irst, we develop a labeled
	dual graph representation of {RNA} secondary structure by adding
	biologically meaningful labels to the dual graphs proposed by {G}an
	et al [1]. {N}ext, we define a similarity measure directly on the
	labeled dual graphs using the recently developed marginalized kernels
	[2]. {U}sing this similarity measure, we were able to train {S}upport
	{V}ector {M}achine classifiers to distinguish {RNA}s of known families
	from random {RNA}s with similar statistics. {F}or 22 of the 25 families
	tested, the classifier achieved better than 70\% accuracy, with much
	higher accuracy rates for some families. {T}raining a set of classifiers
	to automatically assign family labels to {RNA}s using a one vs. all
	multi-class scheme also yielded encouraging results. {F}rom these
	initial learning experiments, we suggest that the labeled dual graph
	representation, together with kernel machine methods, has potential
	for use in automated analysis and classification of uncharacterized
	{RNA} molecules or efficient genome-wide screens for {RNA} molecules
	from existing families.},
  keywords = {biosvm},
  url = {http://helix-web.stanford.edu/psb05/karklin.pdf}
}
@article{BioCyc2005,
  author = {Karp, P. D. and Ouzounis, C. A. and Moore-Kochlacs, C. and Goldovsky,
	L. and Kaipa, P. and Ahren, D. and Tsoka, S. and Darzentas, N. and
	Kunin, V. and Lopez-Bigas, N.},
  title = {Expansion of the {BioCyc} collection of pathway/genome databases
	to 160 genomes},
  journal = {Nucleic {Acids} {Res.}},
  year = {2005},
  volume = {33},
  pages = {6083--6089},
  number = {19},
  abstract = {The {B}io{C}yc database collection is a set of 160 pathway/genome
	databases ({PGDB}s) for most eukaryotic and prokaryotic species whose
	genomes have been completely sequenced to date. {E}ach {PGDB} in
	the {B}io{C}yc collection describes the genome and predicted metabolic
	network of a single organism, inferred from the {M}eta{C}yc database,
	which is a reference source on metabolic pathways from multiple organisms.
	{I}n addition, each bacterial {PGDB} includes predicted operons for
	the corresponding species. {T}he {B}io{C}yc collection provides a
	unique resource for computational systems biology, namely global
	and comparative analyses of genomes and metabolic networks, and a
	supplement to the {B}io{C}yc resource of curated {PGDB}s. {T}he {O}mics
	viewer available through the {B}io{C}yc website allows scientists
	to visualize combinations of gene expression, proteomics and metabolomics
	data on the metabolic maps of these organisms. {T}his paper discusses
	the computational methodology by which the {B}io{C}yc collection
	has been expanded, and presents an aggregate analysis of the collection
	that includes the range of number of pathways present in these organisms,
	and the most frequently observed pathways. {W}e seek scientists to
	adopt and curate individual {PGDB}s within the {B}io{C}yc collection.
	{O}nly by harnessing the expertise of many scientists we can hope
	to produce biological databases, which accurately reflect the depth
	and breadth of knowledge that the biomedical research community is
	producing.},
  keywords = {Animals; Computational Biology; *Databases, Genetic; *Genome; Genome,
	Archaeal; Genome, Bacterial; Genomics; Humans; Metabolism/genetics; Research
	Support, N.I.H., Extramural; Research Support, Non-U.S. Gov't; Research
	Support, U.S. Gov't, P.H.S.}
}
@article{Karplus1998Hidden,
  author = {Karplus, K. and Barrett, C. and Hughey, R.},
  title = {Hidden {Markov} {Models} for {Detecting} {Remote} {Protein} {Homologies}},
  journal = {Bioinformatics},
  year = {1998},
  volume = {14},
  pages = {846--856},
  number = {10},
  pdf = {../local/karp98.pdf},
  file = {karp98.pdf:local/karp98.pdf:PDF},
  subject = {biocasp},
  url = {http://www.cse.ucsc.edu/research/compbio/papers/w9824.ps}
}
@inproceedings{Kashima2003Marginalized,
  author = {Kashima, H. and Tsuda, K. and Inokuchi, A.},
  title = {Marginalized {Kernels} between {Labeled} {Graphs}},
  booktitle = {Proceedings of the {Twentieth} {International} {Conference} on {Machine}
	{Learning}},
  year = {2003},
  editor = {Fawcett, T. and Mishra, N.},
  pages = {321--328},
  address = {New York, NY, USA},
  publisher = {AAAI Press},
  pdf = {../local/Kashima2003Marginalized.pdf},
  file = {Kashima2003Marginalized.pdf:local/Kashima2003Marginalized.pdf:PDF},
  keywords = {biosvm},
  owner = {vert}
}
@incollection{Kashima2004Kernels,
  author = {Kashima, H. and Tsuda, K. and Inokuchi, A.},
  title = {Kernels for graphs},
  booktitle = {Kernel {Methods} in {Computational} {Biology}},
  publisher = {MIT Press},
  year = {2004},
  editor = {Sch{\"o}lkopf, B. and Tsuda, K. and Vert, J.-P.},
  pages = {155--170},
  address = {Cambridge, MA, USA},
  keywords = {biosvm chemoinformatics},
  owner = {vert}
}
@article{Kharchenko2004Filling,
  author = {Kharchenko, P. and Vitkup, D. and Church, G. M.},
  title = {Filling gaps in a metabolic network using expression information.},
  journal = {Bioinformatics},
  year = {2004},
  volume = {20},
  pages = {i178--i185},
  number = {Suppl 1},
  month = aug,
  abstract = {MOTIVATION: The metabolic models of both newly sequenced and well-studied
	organisms contain reactions for which the enzymes have not been identified
	yet. We present a computational approach for identifying genes encoding
	such missing metabolic enzymes in a partially reconstructed metabolic
	network. RESULTS: The metabolic expression placement (MEP) method
	relies on the coexpression properties of the metabolic network and
	is complementary to the sequence homology and genome context methods
	that are currently being used to identify missing metabolic genes.
	The MEP algorithm predicts over 20\% of all known Saccharomyces cerevisiae
	metabolic enzyme-encoding genes within the top 50 out of 5594 candidates
	for their enzymatic function, and 70\% of metabolic genes whose expression
	level has been significantly perturbed across the conditions of the
	expression dataset used. AVAILABILITY: Freely available (in Supplementary
	information). SUPPLEMENTARY INFORMATION: Available at the following
	URL http://arep.med.harvard.edu/kharchenko/mep/supplements.html},
  doi = {10.1093/bioinformatics/bth930},
  keywords = {Bacterial, Binding Sites, Biological, Comparative Study, DNA, Energy
	Metabolism, Enzyme Induction, Enzymes, Escherichia coli Proteins,
	Fungal, Gene Expression Regulation, Genes, Genetic, Genome, Models,
	Non-P.H.S., Non-U.S. Gov't, Phylogeny, Promoter Regions (Genetics),
	Protein, Research Support, Saccharomyces cerevisiae, Saccharomyces
	cerevisiae Proteins, Sequence Analysis, Systems Biology, Transcription
	Factors, U.S. Gov't},
  pii = {20/suppl_1/i178},
  pmid = {15262797},
  timestamp = {2006.11.21},
  url = {http://dx.doi.org/10.1093/bioinformatics/bth930}
}
@article{Kim2004Predictiona,
  author = {Kim, H. and Park, H.},
  title = {Prediction of protein relative solvent accessibility with support
	vector machines and long-range interaction {3D} local descriptor},
  journal = {Proteins},
  year = {2004},
  volume = {54},
  pages = {557--562},
  number = {3},
  month = feb,
  abstract = {The prediction of protein relative solvent accessibility gives us
	helpful information for the prediction of tertiary structure of a
	protein. {T}he {SVM}psi method, which uses support vector machines
	({SVM}s), and the position-specific scoring matrix ({PSSM}) generated
	from {PSI}-{BLAST} have been applied to achieve better prediction
	accuracy of the relative solvent accessibility. {W}e have introduced
	a three-dimensional local descriptor that contains information about
	the expected remote contacts by both the long-range interaction matrix
	and neighbor sequences. {M}oreover, we applied feature weights to
	kernels in {SVM}s in order to consider the degree of significance
	that depends on the distance from the specific amino acid. {R}elative
	solvent accessibility based on a two state-model, for 25\%, 16\%, 5\%,
	and 0\% accessibility are predicted at 78.7\%, 80.7\%, 82.4\%, and 87.4\%
	accuracy, respectively. {T}hree-state prediction results provide
	a 64.5\% accuracy with 9\%; 36\% threshold. {T}he support vector machine
	approach has successfully been applied for solvent accessibility
	prediction by considering long-range interaction and handling unbalanced
	data.},
  doi = {10.1002/prot.10602},
  pdf = {../local/Kim2004Predictiona.pdf},
  file = {Kim2004Predictiona.pdf:local/Kim2004Predictiona.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1002/prot.10602}
}
@article{Kim2003Protein,
  author = {Kim, H. and Park, H.},
  title = {Protein secondary structure prediction based on an improved support
	vector machines approach},
  journal = {Protein {Eng.}},
  year = {2003},
  volume = {16},
  pages = {553--560},
  number = {8},
  month = aug,
  abstract = {The prediction of protein secondary structure is an important step
	in the prediction of protein tertiary structure. {A} new protein
	secondary structure prediction method, {SVM}psi, was developed to
	improve the current level of prediction by incorporating new tertiary
	classifiers and their jury decision system, and the {PSI}-{BLAST}
	{PSSM} profiles. {A}dditionally, efficient methods to handle unbalanced
	data and a new optimization strategy for maximizing the {Q}3 measure
	were developed. {T}he {SVM}psi produces the highest published {Q}3
	and {SOV}94 scores on both the {RS}126 and {CB}513 data sets to date.
	{F}or a new {KP}480 set, the prediction accuracy of {SVM}psi was
	{Q}3 = 78.5\% and {SOV}94 = 82.8\%. {M}oreover, the blind test results
	for 136 non-redundant protein sequences which do not contain homologues
	of training data sets were {Q}3 = 77.2\% and {SOV}94 = 81.8\%. {T}he
	{SVM}psi results in {CASP}5 illustrate that it is another competitive
	method to predict protein secondary structure.},
  pdf = {../local/Kim2003Protein.pdf},
  file = {Kim2003Protein.pdf:local/Kim2003Protein.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://peds.oupjournals.org/cgi/content/abstract/16/8/553}
}
@unpublished{Kim2001Evolving,
  author = {Kim, J. and Krapivsky, P. L. and Kahng, B. and Redner, S.},
  title = {Evolving protein interaction networks},
  note = {E-print cond-mat/0203167},
  year = {2002},
  eprint = {cond-mat/0203167},
  archiveprefix = {arXiv},
  pdf = {../local/kim02.pdf},
  file = {kim02.pdf:local/kim02.pdf:PDF},
  subject = {bionetprot},
  url = {http://arxiv.org/abs/cond-mat/0203167}
}
@article{Kim2004Prediction,
  author = {Kim, J. H. and Lee, J. and Oh, B. and Kimm, K. and Koh, I.},
  title = {Prediction of phosphorylation sites using {SVM}s},
  journal = {Bioinformatics},
  year = {2004},
  volume = {20},
  pages = {3179--3184},
  number = {17},
  abstract = {Motivation: {P}hosphorylation is involved in diverse signal transduction
	pathways. {B}y predicting phosphorylation sites and their kinases
	from primary protein sequences, we can obtain much valuable information
	that can form the basis for further research. {U}sing support vector
	machines, we attempted to predict phosphorylation sites and the type
	of kinase that acts at each site. {R}esults: {O}ur prediction system
	was limited to phosphorylation sites catalyzed by four protein kinase
	families and four protein kinase groups. {T}he accuracy of the predictions
	ranged from 83 to 95\% at the kinase family level, and 76--91\% at the
	kinase group level. {T}he prediction system used--{P}red{P}hospho--can
	be applied to the functional study of proteins, and can help predict
	the changes in phosphorylation sites caused by amino acid variations
	at intra- and interspecies levels. {A}vailability: {P}red{P}hospho
	is available at http://www.ngri.re.kr/proteo/{P}red{P}hospho.htm.
	{S}upplementary information: http://www.ngri.re.kr/proteo/supplementary.doc},
  doi = {10.1093/bioinformatics/bth382},
  pdf = {../local/Kim2004Prediction.pdf},
  file = {Kim2004Prediction.pdf:local/Kim2004Prediction.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/20/17/3179}
}
@article{Kim2004Emotion,
  author = {Kim, K. H. and Bang, S. W. and Kim, S. R.},
  title = {Emotion recognition system using short-term monitoring of physiological
	signals.},
  journal = {Med {Biol} {Eng} {Comput}},
  year = {2004},
  volume = {42},
  pages = {419--427},
  number = {3},
  month = may,
  abstract = {A physiological signal-based emotion recognition system is reported.
	{T}he system was developed to operate as a user-independent system,
	based on physiological signal databases obtained from multiple subjects.
	{T}he input signals were electrocardiogram, skin temperature variation
	and electrodermal activity, all of which were acquired without much
	discomfort from the body surface, and can reflect the influence of
	emotion on the autonomic nervous system. {T}he system consisted of
	preprocessing, feature extraction and pattern classification stages.
	{P}reprocessing and feature extraction methods were devised so that
	emotion-specific characteristics could be extracted from short-segment
	signals. {A}lthough the features were carefully extracted, their
	distribution formed a classification problem, with large overlap
	among clusters and large variance within clusters. {A} support vector
	machine was adopted as a pattern classifier to resolve this difficulty.
	{C}orrect-classification ratios for 50 subjects were 78.4\% and 61.8\%,
	for the recognition of three and four categories, respectively.},
  keywords = {Algorithms, Animals, Antisense, Artificial Intelligence, Autonomic
	Nervous System, Cell Line, Child, Cluster Analysis, Comparative Study,
	Computational Biology, Computer Simulation, Computer-Assisted, DNA
	Fingerprinting, Drug Evaluation, Emotions, Fluorescence, Fuzzy Logic,
	Gene Silencing, Gene Targeting, Genetic, Hela Cells, Humans, Imaging,
	Intracellular Space, Microscopy, Models, Monitoring, Neoplasms, Neural
	Networks (Computer), Non-U.S. Gov't, Oligonucleotides, P.H.S., Physiologic,
	Preclinical, Preschool, Prognosis, Proteomics, Quantitative Structure-Activity
	Relationship, RNA, RNA Interference, Recognition (Psychology), Research
	Support, Sensitivity and Specificity, Signal Processing, Small Interfering,
	Thionucleotides, Three-Dimensional, Tumor, U.S. Gov't, User-Computer
	Interface},
  pmid = {15191089}
}
@inproceedings{Kin2002Marginalized,
  author = {Kin, T. and Tsuda, K. and Asai, K.},
  title = {Marginalized kernels for {RNA} sequence data analysis},
  booktitle = {Genome {Informatics} 2002},
  year = {2002},
  editor = {Lathrop, R. H. and Nakai, K. and Miyano, S. and Takagi, T. and Kanehisa,
	M.},
  pages = {112--122},
  publisher = {Universal Academic Press},
  abstract = {We present novel kernels that measure similarity of two {RNA} sequences,
	taking account of their secondary structures. {T}wo types of kernels
	are presented. {O}ne is for {RNA} sequences with known secondary
	structures, the other for those without known secondary structures.
	{T}he latter employs stochastic context-free grammar ({SCFG}) for
	estimating the secondary structure. {W}e call the latter the \textit{marginalized
	count kernel} ({MCK}). {W}e show computational experiments
	for {MCK} using 74 sets of human t{RNA} sequence data: (i) kernel
	principal component analysis ({PCA}) for visualizing t{RNA} similarities,
	(ii) supervised classification with support vector machines ({SVM}s).
	{B}oth types of experiment show promising results for {MCK}s.},
  pdf = {../local/Kin2002Marginalized.pdf},
  file = {Kin2002Marginalized.pdf:local/Kin2002Marginalized.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://www.jsbi.org/journal/GIW02/GIW02F012.html}
}
@article{Kohlmann2004Pediatric,
  author = {Kohlmann, A. and Schoch, C. and Schnittger, S. and Dugas, M. and
	Hiddemann, W. and Kern, W. and Haferlach, T.},
  title = {Pediatric acute lymphoblastic leukemia ({ALL}) gene expression signatures
	classify an independent cohort of adult {ALL} patients},
  journal = {Leukemia},
  year = {2004},
  volume = {18},
  pages = {63--71},
  number = {1},
  abstract = {Recent reports support a possible future application of gene expression
	profiling for the diagnosis of leukemias. {H}owever, the robustness
	of subtype-specific gene expression signatures has to be proven on
	independent patient samples. {H}ere, we present gene expression data
	of 34 adult acute lymphoblastic leukemia ({ALL}) patients ({A}ffymetrix
	{U}133{A} microarrays). {S}upport {V}ector {M}achines ({SVM}s) were
	applied to stratify our samples based on given gene lists reported
	to predict {MLL}, {BCR}-{ABL}, and {T}-{ALL}, as well as {MLL} and
	non-{MLL} gene rearrangement positive pediatric {ALL}. {I}n addition,
	seven other {B}-precursor {ALL} cases not bearing t(9;22) or t(11q23)/{MLL}
	chromosomal aberrations were analyzed. {U}sing top differentially
	expressed genes, hierarchical cluster and principal component analyses
	demonstrate that the genetically more heterogeneous {B}-precursor
	{ALL} samples intercalate with {BCR}-{ABL}-positive cases, but were
	clearly distinct from {T}-{ALL} and {MLL} profiles. {S}imilar expression
	signatures were observed for both heterogeneous {B}-precursor {ALL}
	and for {BCR}-{ABL}-positive cases. {A}s an unrelated laboratory,
	we demonstrate that gene signatures defined for childhood {ALL} were
	also capable of stratifying distinct subtypes in our cohort of adult
	{ALL} patients. {A}s such, previously reported gene expression patterns
	identified by microarray technology are validated and confirmed on
	truly independent leukemia patient samples.},
  doi = {10.1038/sj.leu.2403167},
  pdf = {../local/Kohlmann2004Pediatric.pdf},
  file = {Kohlmann2004Pediatric.pdf:local/Kohlmann2004Pediatric.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1038/sj.leu.2403167}
}
@article{Koike2004Prediction,
  author = {Koike, A. and Takagi, T.},
  title = {Prediction of protein-protein interaction sites using support vector
	machines},
  journal = {Protein {Eng.} {Des.} {Sel.}},
  year = {2004},
  volume = {17},
  pages = {165--173},
  number = {2},
  month = feb,
  abstract = {The identification of protein-protein interaction sites is essential
	for the mutant design and prediction of protein-protein networks.
	{T}he interaction sites of residue units were predicted using support
	vector machines ({SVM}) and the profiles of sequentially/spatially
	neighboring residues, plus additional information. {W}hen only sequence
	information was used, prediction performance was highest using the
	feature vectors, sequentially neighboring profiles and predicted
	interaction site ratios, which were calculated by {SVM} regression
	using amino acid compositions. {W}hen structural information was
	also used, prediction performance was highest using the feature vectors,
	spatially neighboring residue profiles, accessible surface areas,
	and the with/without protein interaction sites ratios predicted by
	{SVM} regression and amino acid compositions. {I}n the latter case,
	the precision at recall = 50\% was 54--56\% for a homo-hetero mixed
	test set and $>$20\% higher than for random prediction. {A}pproximately
	30\% of the residues wrongly predicted as interaction sites were the
	closest sequentially/spatially neighboring on the interaction site
	residues. {T}he predicted residues covered 86--87\% of the actual interfaces
	(96--97\% of interfaces with over 20 residues). {T}his prediction performance
	appeared to be slightly higher than a previously reported study.
	{C}omparing the prediction accuracy of each molecule, it seems to
	be easier to predict interaction sites for stable complexes.},
  doi = {10.1093/protein/gzh020},
  pdf = {../local/Koike2004Prediction.pdf},
  file = {Koike2004Prediction.pdf:local/Koike2004Prediction.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1093/protein/gzh020}
}
@article{Komura2005Multidimensional,
  author = {Komura, D. and Nakamura, H. and Tsutsumi, S. and Aburatani, H. and
	Ihara, S.},
  title = {Multidimensional support vector machines for visualization of gene
	expression data},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21},
  pages = {439--444},
  number = {4},
  month = feb,
  abstract = {Motivation: {S}ince {DNA} microarray experiments provide us with huge
	amount of gene expression data, they should be analyzed with statistical
	methods to extract the meanings of experimental results. {S}ome dimensionality
	reduction methods such as {P}rincipal {C}omponent {A}nalysis ({PCA})
	are used to roughly visualize the distribution of high dimensional
	gene expression data. {H}owever, in the case of binary classification
	of gene expression data, {PCA} does not utilize class information
	when choosing axes. {T}hus clearly separable data in the original
	space may not be so in the reduced space used in {PCA}. {R}esults:
	{F}or visualization and class prediction of gene expression data,
	we have developed a new {SVM}-based method called multidimensional
	{SVM}s, that generate multiple orthogonal axes. {T}his method projects
	high dimensional data into lower dimensional space to exhibit properties
	of the data clearly and to visualize a distribution of the data roughly.
	{F}urthermore, the multiple axes can be used for class prediction.
	{T}he basic properties of conventional {SVM}s are retained in our
	method: solutions of mathematical programming are sparse, and nonlinear
	classification is implemented implicitly through the use of kernel
	functions. {T}he application of our method to the experimentally
	obtained gene expression datasets for patients' samples indicates
	that our algorithm is efficient and useful for visualization and
	class prediction.},
  doi = {10.1093/bioinformatics/bti188},
  pdf = {../local/Komura2005Multidimensional.pdf},
  file = {Komura2005Multidimensional.pdf:local/Komura2005Multidimensional.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/bti188v1}
}
@incollection{Kondor2004Diffusion,
  author = {Kondor, R. and Vert, J.-P.},
  title = {Diffusion kernels},
  booktitle = {Kernel {M}ethods in {C}omputational {B}iology},
  publisher = {MIT Press},
  year = {2004},
  editor = {Sch{\"o}lkopf, B. and Tsuda, K. and Vert, J.-P.},
  pages = {171--192},
  pdf = {../local/saigo.pdf:http\://cg.ensmp.fr/~vert/publi/04kmcbbook/saigo.pdf:PDF},
  file = {saigo.pdf:http\://cg.ensmp.fr/~vert/publi/04kmcbbook/saigo.pdf:PDF;saigo.pdf:http\://cg.ensmp.fr/~vert/publi/04kmcbbook/saigo.pdf:PDF},
  keywords = {biosvm},
  owner = {vert}
}
@inproceedings{Kondor2002Diffusion,
  author = {Kondor, R. I. and Lafferty, J.},
  title = {Diffusion kernels on graphs and other discrete input spaces},
  booktitle = {Proceedings of the Nineteenth International Conference on Machine
	Learning},
  year = {2002},
  pages = {315--322},
  address = {San Francisco, CA, USA},
  publisher = {Morgan Kaufmann Publishers Inc.},
  pdf = {../local/Kondor2002Diffusion.pdf},
  file = {Kondor2002Diffusion.pdf:Kondor2002Diffusion.pdf:PDF},
  keywords = {biosvm},
  subject = {kernelnet}
}
@article{Kononen1998Tissue,
  author = {Kononen, J. and Bubendorf, L. and Kallioniemi, A. and B{\"a}rlund, M.
	and Schraml, P. and Leighton, S. and Torhorst, J. and Mihatsch, M. J.
	and Sauter, G. and Kallioniemi, O. P.},
  title = {Tissue microarrays for high-throughput molecular profiling of tumor
	specimens},
  journal = {Nat Med},
  year = {1998},
  volume = {4},
  pages = {844--847},
  number = {7},
  month = jul,
  abstract = {Many genes and signalling pathways controlling cell proliferation,
	death and differentiation, as well as genomic integrity, are involved
	in cancer development. New techniques, such as serial analysis of
	gene expression and cDNA microarrays, have enabled measurement of
	the expression of thousands of genes in a single experiment, revealing
	many new, potentially important cancer genes. These genome screening
	tools can comprehensively survey one tumor at a time; however, analysis
	of hundreds of specimens from patients in different stages of disease
	is needed to establish the diagnostic, prognostic and therapeutic
	importance of each of the emerging cancer gene candidates. Here we
	have developed an array-based high-throughput technique that facilitates
	gene expression and copy number surveys of very large numbers of
	tumors. As many as 1000 cylindrical tissue biopsies from individual
	tumors can be distributed in a single tumor tissue microarray. Sections
	of the microarray provide targets for parallel in situ detection
	of DNA, RNA and protein targets in each specimen on the array, and
	consecutive sections allow the rapid analysis of hundreds of molecular
	markers in the same set of specimens. Our detection of six gene amplifications
	as well as p53 and estrogen receptor expression in breast cancer
	demonstrates the power of this technique for defining new subgroups
	of tumors.},
  institution = {Laboratory of Cancer Genetics, National Human Genome Research Institute,
	National Institutes of Health, Bethesda, MD 20892-4470, USA.},
  keywords = {Animals; Breast Neoplasms, genetics/metabolism/pathology; Cyclin D1,
	genetics/metabolism; Female; Genetic Techniques; Humans; Immunoenzyme
	Techniques; In Situ Hybridization, Fluorescence; Mice; Oncogene Proteins
	v-myb; Proto-Oncogene Proteins c-myc, genetics/metabolism; Rabbits;
	Receptor, erbB-2, genetics/metabolism; Receptors, Estrogen, genetics/metabolism;
	Retroviridae Proteins, Oncogenic, genetics/metabolism; Tumor Markers,
	Biological, genetics/metabolism; Tumor Suppressor Protein p53, genetics/metabolism},
  language = {eng},
  medline-pst = {ppublish},
  owner = {philippe},
  pmid = {9662379},
  timestamp = {2010.08.08}
}
@article{Korber2006Immunoinformatics,
  author = {Korber, Bette and LaBute, Montiago and Yusim, Karina},
  title = {Immunoinformatics comes of age},
  journal = {PLoS Comput. Biol.},
  year = {2006},
  volume = {2},
  pages = {e71},
  number = {6},
  month = jun,
  abstract = {With the burgeoning immunological data in the scientific literature,
	scientists must increasingly rely on Internet resources to inform
	and enhance their work. Here we provide a brief overview of the adaptive
	immune response and summaries of immunoinformatics resources, emphasizing
	those with Web interfaces. These resources include searchable databases
	of epitopes and immune-related molecules, and analysis tools for
	T cell and B cell epitope prediction, vaccine design, and protein
	structure comparisons. There is an agreeable synergy between the
	growing collections in immune-related databases and the growing sophistication
	of analysis software; the databases provide the foundation for developing
	predictive computational tools, which in turn enable more rapid identification
	of immune responses to populate the databases. Collectively, these
	resources contribute to improved understanding of immune responses
	and escape, and evolution of pathogens under immune pressure. The
	public health implications are vast, including designing vaccines,
	understanding autoimmune diseases, and defining the correlates of
	immune protection.},
  doi = {10.1371/journal.pcbi.0020071},
  keywords = {Amino Acid Sequence; Animals; Computational Biology; Databases, Factual;
	Epitopes, B-Lymphocyte; Epitopes, T-Lymphocyte; Humans; Immunity},
  owner = {laurent},
  pii = {06-PLCB-RV-0068},
  pmid = {16846250},
  timestamp = {2007.08.23},
  url = {http://dx.doi.org/10.1371/journal.pcbi.0020071}
}
@article{Kote-Jarai2004Gene,
  author = {Kote-Jarai, Zsofia and Williams, Richard D. and Cattini, Nicola and
	Copeland, Maria and Giddings, Ian and Wooster, Richard and tePoele,
	Robert H. and Workman, Paul and Gusterson, Barry and Peacock, John
	and Gui, Gerald and Campbell, Colin and Eeles, Ros},
  title = {Gene expression profiling after radiation-induced {DNA} damage is
	strongly predictive of {BRCA}1 mutation carrier status},
  journal = {Clin. {C}ancer {R}es.},
  year = {2004},
  volume = {10},
  pages = {958--963},
  number = {3},
  month = feb,
  abstract = {P{URPOSE}: {T}he impact of the presence of a germ-line {BRCA}1 mutation
	on gene expression in normal breast fibroblasts after radiation-induced
	{DNA} damage has been investigated. {EXPERIMENTAL} {DESIGN}: {H}igh-density
	c{DNA} microarray technology was used to identify differential responses
	to {DNA} damage in fibroblasts from nine heterozygous {BRCA}1 mutation
	carriers compared with five control samples without personal or family
	history of any cancer. {F}ibroblast cultures were irradiated, and
	their expression profile was compared using intensity ratios of the
	c{DNA} microarrays representing 5603 {IMAGE} clones. {RESULTS}: {C}lass
	comparison and class prediction analysis has shown that {BRCA}1 mutation
	carriers can be distinguished from controls with high probability
	(approximately 85\%). {S}ignificance analysis of microarrays and
	the support vector machine classifier identified gene sets that discriminate
	the samples according to their mutation status. {T}hese include genes
	already known to interact with {BRCA}1 such as {CDKN}1{B}, {ATR},
	and {RAD}51. {CONCLUSIONS}: {T}he results of this initial study suggest
	that normal cells from heterozygous {BRCA}1 mutation carriers display
	a different gene expression profile from controls in response to
	{DNA} damage. {A}daptations of this pilot result to other cell types
	could result in the development of a functional assay for {BRCA}1
	mutation status.},
  pdf = {../local/Kote-Jarai2004Gene.pdf},
  file = {Kote-Jarai2004Gene.pdf:local/Kote-Jarai2004Gene.pdf:PDF},
  keywords = {biosvm, breastcancer},
  url = {http://clincancerres.aacrjournals.org/cgi/content/abstract/10/3/958}
}
@article{Kramer2002Fragment,
  author = {Kramer, S. and Frank, E. and Helma, C.},
  title = {Fragment generation and support vector machines for inducing {SAR}s},
  journal = {S{AR} {QSAR} {E}nviron {R}es},
  year = {2002},
  volume = {13},
  pages = {509--523},
  number = {5},
  month = jul,
  abstract = {We present a new approach to the induction of {SAR}s based on the
	generation of structural fragments and support vector machines ({SVM}s).
	{I}t is tailored for bio-chemical databases, where the examples are
	two-dimensional descriptions of chemical compounds. {T}he fragment
	generator finds all fragments (i.e. linearly connected atoms) that
	satisfy user-specified constraints regarding their frequency and
	generality. {I}n this paper, we are querying for fragments within
	a minimum and a maximum frequency in the dataset. {A}fter fragment
	generation, we propose to apply {SVM}s to the problem of inducing
	{SAR}s from these fragments. {W}e conjecture that the {SVM}s are
	particularly useful in this context, as they can deal with a large
	number of features. {E}xperiments in the domains of carcinogenicity
	and mutagenicity prediction show that the minimum and the maximum
	frequency queries for fragments can be answered within a reasonable
	time, and that the predictive accuracy obtained using these fragments
	is satisfactory. {H}owever, further experiments will have to confirm
	that this is a viable approach to inducing {SAR}s.},
  doi = {10.1080/10629360290023340},
  keywords = {biosvm},
  url = {http://dx.doi.org/10.1080/10629360290023340}
}
@article{Krishnan2003comparative,
  author = {Krishnan, V. G. and Westhead, D. R.},
  title = {A comparative study of machine-learning methods to predict the effects
	of single nucleotide polymorphisms on protein function},
  journal = {Bioinformatics},
  year = {2003},
  volume = {19},
  pages = {2199--2209},
  number = {17},
  abstract = {Motivation: {T}he large volume of single nucleotide polymorphism data
	now available motivates the development of methods for distinguishing
	neutral changes from those which have real biological effects. {H}ere,
	two different machine-learning methods, decision trees and support
	vector machines ({SVM}s), are applied for the first time to this
	problem. {I}n common with most other methods, only non-synonymous
	changes in protein coding regions of the genome are considered. {R}esults:
	{I}n detailed cross-validation analysis, both learning methods are
	shown to compete well with existing methods, and to out-perform them
	in some key tests. {SVM}s show better generalization performance,
	but decision trees have the advantage of generating interpretable
	rules with robust estimates of prediction confidence. {I}t is shown
	that the inclusion of protein structure information produces more
	accurate methods, in agreement with other recent studies, and the
	effect of using predicted rather than actual structure is evaluated.
	{A}vailability: {S}oftware is available on request from the authors.},
  pdf = {../local/Krishnan2003comparative.pdf},
  file = {Krishnan2003comparative.pdf:local/Krishnan2003comparative.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/17/2199}
}
@incollection{Krishnapuram2004Gene,
  author = {Krishnapuram, B. and Carin, L. and Hartemink, A.},
  title = {Gene expression analysis: joint feature selection and classifier
	design},
  booktitle = {Kernel {M}ethods in {C}omputational {B}iology},
  publisher = {MIT Press},
  year = {2004},
  editor = {Sch{\"o}lkopf, B. and Tsuda, K. and Vert, J.-P.},
  pages = {299--317},
  pdf = {../local/heterogeneous.pdf:http\://cg.ensmp.fr/~vert/publi/04kmcbbook/heterogeneous.pdf:PDF},
  file = {heterogeneous.pdf:http\://cg.ensmp.fr/~vert/publi/04kmcbbook/heterogeneous.pdf:PDF;heterogeneous.pdf:http\://cg.ensmp.fr/~vert/publi/04kmcbbook/heterogeneous.pdf:PDF},
  keywords = {biosvm},
  owner = {vert}
}
@article{Krishnapuram2004Joint,
  author = {Krishnapuram, B. and Carin, L. and Hartemink, A.},
  title = {Joint {C}lassifier and {F}eature {O}ptimization for {C}omprehensive
	{C}ancer {D}iagnosis {U}sing {G}ene {E}xpression {D}ata},
  journal = {J. {C}omput. {B}iol.},
  year = {2004},
  volume = {11},
  pages = {227--242},
  number = {2-3},
  abstract = {{R}ecent research has demonstrated quite convincingly that accurate cancer
	diagnosis can be achieved by constructing classifiers that are designed
	to compare the gene expression profile of a tissue of unknown cancer
	status to a database of stored expression profiles from tissues of
	known cancer status. {T}his paper introduces the {JCFO}, a novel
	algorithm that uses a sparse {B}ayesian approach to jointly identify
	both the optimal nonlinear classifier for diagnosis and the optimal
	set of genes on which to base that diagnosis. {W}e show that the
	diagnostic classification accuracy of the proposed algorithm is superior
	to a number of current state-of-the-art methods in a full leave-one-out
	cross-validation study of five widely used benchmark datasets. {I}n
	addition to its superior classification accuracy, the algorithm is
	designed to automatically identify a small subset of genes (typically
	around twenty in our experiments) that are capable of providing complete
	discriminatory information for diagnosis. {F}ocusing attention on
	a small subset of genes is useful not only because it produces a
	classifier with good generalization capacity, but also because this
	set of genes may provide insights into the mechanisms responsible
	for the disease itself. {A} number of the genes identified by the
	{JCFO} in our experiments are already in use as clinical markers
	for cancer diagnosis; some of the remaining genes may be excellent
	candidates for further clinical investigation. {I}f it is possible
	to identify a small set of genes that is indeed capable of providing
	complete discrimination, inexpensive diagnostic assays might be widely
	deployable in clinical settings.},
  doi = {10.1089/1066527041410463},
  pdf = {../local/Krishnapuram2004Joint.pdf},
  file = {Krishnapuram2004Joint.pdf:local/Krishnapuram2004Joint.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1089/1066527041410463}
}
@article{Krishnapuram2004bayesian,
  author = {Krishnapuram, B. and Hartemink, A. J. and Carin, L. and Figueiredo,
	M. A. T.},
  title = {A {B}ayesian approach to joint feature selection and classifier design},
  journal = {IEEE T. Pattern. Anal.},
  year = {2004},
  volume = {26},
  pages = {1105--1111},
  number = {9},
  month = sep,
  abstract = {This paper adopts a {B}ayesian approach to simultaneously learn both
	an optimal nonlinear classifier and a subset of predictor variables
	(or features) that are most relevant to the classification task.
	{T}he approach uses heavy-tailed priors to promote sparsity in the
	utilization of both basis functions and features; these priors act
	as regularizers for the likelihood function that rewards good classification
	on the training data. {W}e derive an expectation-maximization ({EM})
	algorithm to efficiently compute a maximum a posteriori ({MAP}) point
	estimate of the various parameters. {T}he algorithm is an extension
	of recent state-of-the-art sparse {B}ayesian classifiers, which in
	turn can be seen as {B}ayesian counterparts of support vector machines.
	{E}xperimental comparisons using kernel classifiers demonstrate both
	parsimonious feature selection and excellent classification accuracy
	on a range of synthetic and benchmark data sets.},
  doi = {10.1109/TPAMI.2004.55},
  pdf = {../local/Krishnapuram2004bayesian.pdf},
  file = {Krishnapuram2004bayesian.pdf:local/Krishnapuram2004bayesian.pdf:PDF},
  keywords = {biosvm},
  url = {http://dx.doi.org/10.1109/TPAMI.2004.55}
}
@article{Kroemer2007Structure,
  author = {Kroemer, Romano T.},
  title = {Structure-based drug design: docking and scoring},
  journal = {Curr. Protein Pept. Sci.},
  year = {2007},
  volume = {8},
  pages = {312--328},
  number = {4},
  month = aug,
  abstract = {This review gives an introduction into ligand - receptor docking and
	illustrates the basic underlying concepts. An overview of different
	approaches and algorithms is provided. Although the application of
	docking and scoring has led to some remarkable successes, there are
	still some major challenges ahead, which are outlined here as well.
	Approaches to address some of these challenges and the latest developments
	in the area are presented. Some aspects of the assessment of docking
	program performance are discussed. A number of successful applications
	of structure-based virtual screening are described.},
  institution = {ciences, Department of Chemistry, Nerviano Medical Sciences, Viale
	Pasteur 10, 20014 Nerviano (MI), Italy. romano.kroemer@sanofi-aventis.com},
  keywords = {Algorithms; Artificial Intelligence; Computational Biology; Computer
	Simulation; Computer-Aided Design; Drug Design; Imaging, Three-Dimensional;
	Ligands; Models, Molecular; Protein Binding; Protein Conformation;
	Software; Structure-Activity Relationship},
  owner = {bricehoffmann},
  pmid = {17696866},
  timestamp = {2009.02.13}
}
@article{Kuang2005Profile-based,
  author = {Kuang, R. and Ie, E. and Wang, K. and Wang, K. and Siddiqi, M. and
	Freund, Y. and Leslie, C.},
  title = {Profile-based string kernels for remote homology detection and motif
	extraction},
  journal = {J. Bioinform. Comput. Biol.},
  year = {2005},
  volume = {3},
  pages = {527--550},
  number = {3},
  month = jun,
  abstract = {We introduce novel profile-based string kernels for use with support
	vector machines (SVMs) for the problems of protein classification
	and remote homology detection. These kernels use probabilistic profiles,
	such as those produced by the PSI-BLAST algorithm, to define position-dependent
	mutation neighborhoods along protein sequences for inexact matching
	of k-length subsequences ("k-mers") in the data. By use of an efficient
	data structure, the kernels are fast to compute once the profiles
	have been obtained. For example, the time needed to run PSI-BLAST
	in order to build the profiles is significantly longer than both
	the kernel computation time and the SVM training time. We present
	remote homology detection experiments based on the SCOP database
	where we show that profile-based string kernels used with SVM classifiers
	strongly outperform all recently presented supervised SVM methods.
	We further examine how to incorporate predicted secondary structure
	information into the profile kernel to obtain a small but significant
	performance improvement. We also show how we can use the learned
	SVM classifier to extract "discriminative sequence motifs"--short
	regions of the original profile that contribute almost all the weight
	of the SVM classification score--and show that these discriminative
	motifs correspond to meaningful structural features in the protein
	data. The use of PSI-BLAST profiles can be seen as a semi-supervised
	learning technique, since PSI-BLAST leverages unlabeled data from
	a large sequence database to build more informative profiles. Recently
	presented "cluster kernels" give general semi-supervised methods
	for improving SVM protein classification performance. We show that
	our profile kernel results also outperform cluster kernels while
	providing much better scalability to large datasets.},
  keywords = {biosvm},
  owner = {vert},
  pii = {S021972000500120X},
  pmid = {16108083},
  timestamp = {2007.08.01}
}
@inproceedings{Kuang2004Profile-based,
  author = {Kuang, R. and Ie, E. and Wang, K. and Wang, K. and Siddiqi, M. and
	Freund, Y. and Leslie, C.},
  title = {Profile-based string kernels for remote homology detection and motif
	extraction},
  booktitle = {Proceedings of the IEEE Computational Systems Bioinformatics Conference},
  year = {2004},
  pages = {152--160},
  abstract = {We introduce novel profile-based string kernels for use with support
	vector machines (SVMs) for the problems of protein classification
	and remote homology detection. These kernels use probabilistic profiles,
	such as those produced by the PSI-BLAST algorithm, to define position-dependent
	mutation neighborhoods along protein sequences for inexact matching
	of k-length subsequences ("k-mers") in the data. By use of an efficient
	data structure, the kernels are fast to compute once the profiles
	have been obtained. For example, the time needed to run PSI-BLAST
	in order to build the profiles is significantly longer than both
	the kernel computation time and the SVM training time. We present
	remote homology detection experiments based on the SCOP database
	where we show that profile-based string kernels used with SVM classifiers
	strongly outperform all recently presented supervised SVM methods.
	We also show how we can use the learned SVM classifier to extract
	"discriminative sequence motifs" -- short regions of the original
	profile that contribute almost all the weight of the SVM classification
	score -- and show that these discriminative motifs correspond to
	meaningful structural features in the protein data. The use of PSI-BLAST
	profiles can be seen as a semi-supervised learning technique, since
	PSI-BLAST leverages unlabeled data from a large sequence database
	to build more informative profiles. Recently presented "cluster kernels"
	give general semi-supervised methods for improving SVM protein classification
	performance. We show that our profile kernel results are comparable
	to cluster kernels while providing much better scalability to large
	datasets.},
  keywords = {biosvm},
  owner = {vert},
  pmid = {16448009},
  timestamp = {2007.08.01}
}
@article{Kuang2004Protein,
  author = {Kuang, R. and Leslie, C. S. and Yang, A.-S.},
  title = {Protein backbone angle prediction with machine learning approaches},
  journal = {Bioinformatics},
  year = {2004},
  volume = {20},
  pages = {1612--1621},
  number = {10},
  abstract = {Motivation: {P}rotein backbone torsion angle prediction provides useful
	local structural information that goes beyond conventional three-state
	({alpha}, {beta} and coil) secondary structure predictions. {A}ccurate
	prediction of protein backbone torsion angles will substantially
	improve modeling procedures for local structures of protein sequence
	segments, especially in modeling loop conformations that do not form
	regular structures as in {alpha}-helices or {beta}-strands. {R}esults:
	{W}e have devised two novel automated methods in protein backbone
	conformational state prediction: one method is based on support vector
	machines ({SVM}s); the other method combines a standard feed-forward
	back-propagation artificial neural network ({NN}) with a local structure-based
	sequence profile database ({LSBSP}1). {E}xtensive benchmark experiments
	demonstrate that both methods have improved the prediction accuracy
	rate over the previously published methods for conformation state
	prediction when using an alphabet of three or four states. {A}vailability:
	{LSBSP}1 and the {NN} algorithm have been implemented in {P}r{ISM}.1,
	which is available from www.columbia.edu/~ay1/. {S}upplementary information:
	{S}upplementary data for the {SVM} method can be downloaded from
	the {W}ebsite www.cs.columbia.edu/compbio/backbone.},
  pdf = {../local/Kuang2004Protein.pdf},
  file = {Kuang2004Protein.pdf:local/Kuang2004Protein.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/20/10/1612}
}
@article{Kumar2005BhairPred,
  author = {Kumar, M. and Bhasin, M. and Natt, N. K. and Raghava, G. P. S.},
  title = {Bhair{P}red: prediction of beta-hairpins in a protein from multiple
	alignment information using {ANN} and {SVM} techniques},
  journal = {Nucleic {A}cids {R}es},
  year = {2005},
  volume = {33},
  pages = {W154--W159},
  number = {Web Server issue},
  month = jul,
  abstract = {This paper describes a method for predicting a supersecondary structural
	motif, beta-hairpins, in a protein sequence. {T}he method was trained
	and tested on a set of 5102 hairpins and 5131 non-hairpins, obtained
	from a non-redundant dataset of 2880 proteins using the {DSSP} and
	{PROMOTIF} programs. {T}wo machine-learning techniques, an artificial
	neural network ({ANN}) and a support vector machine ({SVM}), were
	used to predict beta-hairpins. {A}n accuracy of 65.5\% was achieved
	using {ANN} when an amino acid sequence was used as the input. {T}he
	accuracy improved from 65.5 to 69.1\% when evolutionary information
	({PSI}-{BLAST} profile), observed secondary structure and surface
	accessibility were used as the inputs. {T}he accuracy of the method
	further improved from 69.1 to 79.2\% when the {SVM} was used for
	classification instead of the {ANN}. {T}he performances of the methods
	developed were assessed in a test case, where predicted secondary
	structure and surface accessibility were used instead of the observed
	structure. {T}he highest accuracy achieved by the {SVM} based method
	in the test case was 77.9\%. {A} maximum accuracy of 71.1\% with
	{M}atthew's correlation coefficient of 0.41 in the test case was
	obtained on a dataset previously used by {X}. {C}ruz, {E}. {G}. {H}utchinson,
	{A}. {S}hephard and {J}. {M}. {T}hornton (2002) {P}roc. {N}atl {A}cad.
	{S}ci. {USA}, 99, 11157-11162. {T}he performance of the method was
	also evaluated on proteins used in the '6th community-wide experiment
	on the critical assessment of techniques for protein structure prediction
	({CASP}6)'. {B}ased on the algorithm described, a web server, {B}hair{P}red
	(http://www.imtech.res.in/raghava/bhairpred/), has been developed,
	which can be used to predict beta-hairpins in a protein using the
	{SVM} approach.},
  doi = {10.1093/nar/gki588},
  pdf = {../local/Kumar2005BhairPred.pdf},
  file = {Kumar2005BhairPred.pdf:local/Kumar2005BhairPred.pdf:PDF},
  keywords = {biosvm},
  url = {http://dx.doi.org/10.1093/nar/gki588}
}
@article{Kurata2006PlosCompBio,
  author = {Kurata, Hiroyuki and El-Samad, Hana and Iwasaki, Rei and Ohtake, Hisao
	and Doyle, John C. and Grigorova, Irina and Gross, Carol A. and Khammash,
	Mustafa},
  title = {Module-based analysis of robustness tradeoffs in the heat shock response
	system},
  journal = {PLoS Comput. Biol.},
  year = {2006},
  volume = {2},
  pages = {e59},
  number = {7},
  month = jul,
  abstract = {Biological systems have evolved complex regulatory mechanisms, even
	in situations where much simpler designs seem to be sufficient for
	generating nominal functionality. Using module-based analysis coupled
	with rigorous mathematical comparisons, we propose that in analogy
	to control engineering architectures, the complexity of cellular
	systems and the presence of hierarchical modular structures can be
	attributed to the necessity of achieving robustness. We employ the
	Escherichia coli heat shock response system, a strongly conserved
	cellular mechanism, as an example to explore the design principles
	of such modular architectures. In the heat shock response system,
	the sigma-factor sigma32 is a central regulator that integrates multiple
	feedforward and feedback modules. Each of these modules provides
	a different type of robustness with its inherent tradeoffs in terms
	of transient response and efficiency. We demonstrate how the overall
	architecture of the system balances such tradeoffs. An extensive
	mathematical exploration nevertheless points to the existence of
	an array of alternative strategies for the existing heat shock response
	that could exhibit similar behavior. We therefore deduce that the
	evolutionary constraints facing the system might have steered its
	architecture toward one of many robustly functional solutions.},
  doi = {10.1371/journal.pcbi.0020059},
  institution = {Department of Bioscience and Bioinformatics, Kyushu Institute of
	Technology, Iizuka, Fukuoka, Japan. kurata@bio.kyutech.ac.jp},
  keywords = {Computer Simulation; Escherichia coli Proteins, metabolism; Escherichia
	coli, metabolism; Feedback, physiology; Gene Expression Regulation,
	Bacterial, physiology; Heat-Shock Proteins, metabolism; Heat-Shock
	Response, physiology; Models, Biological; Oxidative Stress, physiology;
	Signal Transduction, physiology; Systems Biology, methods},
  language = {eng},
  medline-pst = {ppublish},
  owner = {Andrei Zinovyev},
  pii = {05-PLCB-RA-0264R4},
  pmid = {16863396},
  timestamp = {2011.04.08},
  url = {http://dx.doi.org/10.1371/journal.pcbi.0020059}
}
@article{Kohler2008Walking,
  author = {K{\"o}hler, S. and Bauer, S. and Horn, D. and Robinson, P. N.},
  title = {Walking the interactome for prioritization of candidate disease genes},
  journal = {Am. J. Hum. Genet.},
  year = {2008},
  volume = {82},
  pages = {949--958},
  number = {4},
  month = apr,
  abstract = {The identification of genes associated with hereditary disorders has
	contributed to improving medical care and to a better understanding
	of gene functions, interactions, and pathways. However, there are
	well over 1500 Mendelian disorders whose molecular basis remains
	unknown. At present, methods such as linkage analysis can identify
	the chromosomal region in which unknown disease genes are located,
	but the regions could contain up to hundreds of candidate genes.
	In this work, we present a method for prioritization of candidate
	genes by use of a global network distance measure, random walk analysis,
	for definition of similarities in protein-protein interaction networks.
	We tested our method on 110 disease-gene families with a total of
	783 genes and achieved an area under the ROC curve of up to 98\%
	on simulated linkage intervals of 100 genes surrounding the disease
	gene, significantly outperforming previous methods based on local
	distance measures. Our results not only provide an improved tool
	for positional-cloning projects but also add weight to the assumption
	that phenotypically similar diseases are associated with disturbances
	of subnetworks within the larger protein interactome that extend
	beyond the disease proteins themselves.},
  doi = {10.1016/j.ajhg.2008.02.013},
  institution = {Institute for Medical Genetics, Charité Universitätsmedizin Berlin,
	Augustenburger Platz 1, 13353 Berlin, Germany.},
  keywords = {Animals; Chromosome Mapping; Computational Biology; Databases, Genetic;
	Genetic Diseases, Inborn; Genetic Predisposition to Disease; Humans;
	Internet; Linkage (Genetics); Mice; Pedigree; Protein Interaction
	Mapping; Software},
  owner = {mordelet},
  pii = {S0002-9297(08)00172-9},
  pmid = {18371930},
  timestamp = {2010.09.28},
  url = {http://dx.doi.org/10.1016/j.ajhg.2008.02.013}
}
@article{LeCao2009Sparse,
  author = {L{\^e} Cao, K.-A. and Martin, P. G. P. and Robert-Grani{\'e}, C. and
	Besse, P.},
  title = {Sparse canonical methods for biological data integration: application
	to a cross-platform study.},
  journal = {BMC Bioinformatics},
  year = {2009},
  volume = {10},
  pages = {34},
  abstract = {In the context of systems biology, few sparse approaches have been
	proposed so far to integrate several data sets. It is however an
	important and fundamental issue that will be widely encountered in
	post genomic studies, when simultaneously analyzing transcriptomics,
	proteomics and metabolomics data using different platforms, so as
	to understand the mutual interactions between the different data
	sets. In this high dimensional setting, variable selection is crucial
	to give interpretable results. We focus on a sparse Partial Least
	Squares approach (sPLS) to handle two-block data sets, where the
	relationship between the two types of variables is known to be symmetric.
	Sparse PLS has been developed either for a regression or a canonical
	correlation framework and includes a built-in procedure to select
	variables while integrating data. To illustrate the canonical mode
	approach, we analyzed the NCI60 data sets, where two different platforms
	(cDNA and Affymetrix chips) were used to study the transcriptome
	of sixty cancer cell lines. We compare the results obtained with two
	other sparse or related canonical correlation approaches: CCA with
	Elastic Net penalization (CCA-EN) and Co-Inertia Analysis (CIA).
	The latter does not include a built-in procedure for variable selection
	and requires a two-step analysis. We stress the lack of statistical
	criteria to evaluate canonical correlation methods, which makes biological
	interpretation absolutely necessary to compare the different gene
	selections. We also propose comprehensive graphical representations
	of both samples and variables to facilitate the interpretation of
	the results. sPLS and CCA-EN selected highly relevant genes and complementary
	findings from the two data sets, which enabled a detailed understanding
	of the molecular characteristics of several groups of cell lines.
	These two approaches were found to bring similar results, although
	they highlighted the same phenomenons with a different priority.
	They outperformed CIA that tended to select redundant information.},
  doi = {10.1186/1471-2105-10-34},
  institution = {Station d'Amélioration Génétique des Animaux UR 631, Institut National
	de Recherche Agronomique, F-31326 Castanet, France. k.lecao@imb.uq.edu.au},
  keywords = {Computational Biology, methods; Genomics; Metabolomics; Proteomics;
	Systems Biology, methods},
  language = {eng},
  medline-pst = {epublish},
  owner = {jp},
  pii = {1471-2105-10-34},
  pmid = {19171069},
  timestamp = {2012.02.29},
  url = {http://dx.doi.org/10.1186/1471-2105-10-34}
}
@article{Lal2004Support,
  author = {Thomas Navin Lal and Michael Schr{\"o}der and Thilo Hinterberger and
	Jason Weston and Martin Bogdan and Niels Birbaumer and Bernhard Sch{\"o}lkopf},
  title = {Support vector channel selection in {BCI}.},
  journal = {{IEEE} Trans Biomed Eng},
  year = {2004},
  volume = {51},
  pages = {1003--1010},
  number = {6},
  month = jun,
  abstract = {Designing a brain computer interface ({BCI}) system one can choose
	from a variety of features that may be useful for classifying brain
	activity during a mental task. {F}or the special case of classifying
	electroencephalogram ({EEG}) signals we propose the usage of the
	state of the art feature selection algorithms {R}ecursive {F}eature
	{E}limination and {Z}ero-{N}orm {O}ptimization which are based on
	the training of support vector machines ({SVM}). {T}hese algorithms
	can provide more accurate solutions than standard filter methods
	for feature selection. {W}e adapt the methods for the purpose of
	selecting {EEG} channels. {F}or a motor imagery paradigm we show
	that the number of used channels can be reduced significantly without
	increasing the classification error. {T}he resulting best channels
	agree well with the expected underlying cortical activity patterns
	during the mental tasks. {F}urthermore we show how time dependent
	task specific information can be visualized.},
  keywords = {Algorithms, Animals, Antisense, Artificial Intelligence, Automated,
	Autonomic Nervous System, Brain, Cell Line, Cerebral Cortex, Child,
	Cluster Analysis, Cognition, Comparative Study, Computational Biology,
	Computer Simulation, Computer-Assisted, DNA Fingerprinting, Databases,
	Drug Evaluation, Electroencephalography, Emotions, Event-Related
	Potentials, Evoked Potentials, Factual, Fluorescence, Fuzzy Logic,
	Gene Silencing, Gene Targeting, Genetic, Hand, Hela Cells, Humans,
	Imaging, Intracellular Space, Male, Microscopy, Models, Monitoring,
	Motor, Neoplasms, Neural Networks (Computer), Non-U.S. Gov't, Oligonucleotides,
	P.H.S., P300, Pattern Recognition, Peptides, Physiologic, Preclinical,
	Predictive Value of Tests, Preschool, Prognosis, Protein Interaction
	Mapping, Protein Structure, Proteins, Proteomics, Quantitative Structure-Activity
	Relationship, Quaternary, RNA, RNA Interference, Recognition (Psychology),
	Reproducibility of Results, Research Support, Sensitivity and Specificity,
	Signal Processing, Small Interfering, Software, Thionucleotides,
	Three-Dimensional, Tumor, U.S. Gov't, User-Computer Interface, Word
	Processing},
  pmid = {15188871}
}
@incollection{Lanckriet2004Kernel-based,
  author = {Lanckriet, G. R. G. and Cristianini, N. and Jordan, M. I. and Noble,
	W. S.},
  title = {Kernel-based integration of genomic data using semidefinite programming},
  booktitle = {Kernel Methods in Computational Biology},
  publisher = {MIT Press},
  year = {2004},
  editor = {Sch{\"o}lkopf, B. and Tsuda, K. and Vert, J. P.},
  pages = {231--259},
  keywords = {biosvm},
  owner = {vert}
}
@inproceedings{Lanckriet2004Kernel-baseda,
  author = {Lanckriet, G. R. and Deng, M. and Cristianini, N. and Jordan, M. I.
	and Noble, W. S.},
  title = {Kernel-based data fusion and its application to protein function
	prediction in yeast.},
  booktitle = {Proceedings of the {Pacific Symposium on Biocomputing}},
  year = {2004},
  pages = {300--311},
  abstract = {Kernel methods provide a principled framework in which to represent
	many types of data, including vectors, strings, trees and graphs.
	{A}s such, these methods are useful for drawing inferences about
	biological phenomena. {W}e describe a method for combining multiple
	kernel representations in an optimal fashion, by formulating the
	problem as a convex optimization problem that can be solved using
	semidefinite programming techniques. {T}he method is applied to the
	problem of predicting yeast protein functional classifications using
	a support vector machine ({SVM}) trained on five types of data. {F}or
	this problem, the new method performs better than a previously-described
	{M}arkov random field method, and better than the {SVM} trained on
	any single type of data.},
  pdf = {../local/Lanckriet2004Kernel-baseda.pdf},
  file = {Lanckriet2004Kernel-baseda.pdf:local/Lanckriet2004Kernel-baseda.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert}
}
@article{Lanckriet2004statistical,
  author = {Lanckriet, G. R. G. and De Bie, T. and Cristianini, N. and Jordan,
	M. I. and Noble, W. S.},
  title = {A statistical framework for genomic data fusion},
  journal = {Bioinformatics},
  year = {2004},
  volume = {20},
  pages = {2626--2635},
  number = {16},
  abstract = {Motivation: {D}uring the past decade, the new focus on genomics has
	highlighted a particular challenge: to integrate the different views
	of the genome that are provided by various types of experimental
	data. {R}esults: {T}his paper describes a computational framework
	for integrating and drawing inferences from a collection of genome-wide
	measurements. {E}ach dataset is represented via a kernel function,
	which defines generalized similarity relationships between pairs
	of entities, such as genes or proteins. {T}he kernel representation
	is both flexible and efficient, and can be applied to many different
	types of data. {F}urthermore, kernel functions derived from different
	types of data can be combined in a straightforward fashion. {R}ecent
	advances in the theory of kernel methods have provided efficient
	algorithms to perform such combinations in a way that minimizes a
	statistical loss function. {T}hese methods exploit semidefinite programming
	techniques to reduce the problem of finding optimizing kernel combinations
	to a convex optimization problem. {C}omputational experiments performed
	using yeast genome-wide datasets, including amino acid sequences,
	hydropathy profiles, gene expression data and known protein-protein
	interactions, demonstrate the utility of this approach. {A} statistical
	learning algorithm trained from all of these data to recognize particular
	classes of proteins--membrane proteins and ribosomal proteins--performs
	significantly better than the same algorithm trained on any single
	type of data. {A}vailability: {S}upplementary data at http://noble.gs.washington.edu/proj/sdp-svm},
  doi = {10.1093/bioinformatics/bth294},
  pdf = {../local/Lanckriet2004statistical.pdf},
  file = {Lanckriet2004statistical.pdf:local/Lanckriet2004statistical.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/20/16/2626}
}
@article{Lazo2000Combinatorial,
  author = {J. S. Lazo and P. Wipf},
  title = {Combinatorial chemistry and contemporary pharmacology.},
  journal = {J. Pharmacol. Exp. Ther.},
  year = {2000},
  volume = {293},
  pages = {705--709},
  number = {3},
  month = jun,
  abstract = {Both solid- and liquid-phase combinatorial chemistry have emerged
	as powerful tools for identifying pharmacologically active compounds
	and optimizing the biological activity of a lead compound. Complementary
	high-throughput in vitro assays are essential for compound evaluation.
	Cell-based assays that use optical endpoints permit investigation
	of a wide variety of functional properties of these compounds including
	specific intracellular biochemical pathways, protein-protein interactions,
	and the subcellular localization of targets. Integration of combinatorial
	chemistry with contemporary pharmacology now represents an important
	factor in drug discovery and development.},
  keywords = {Alzheimer Disease, Animals, Antineoplastic Agents, Biological, Bleomycin,
	Cell Cycle, Cell Cycle Proteins, Cell Death, Cell Line, Cell Nucleus,
	Cell Shape, Cell Transformation, Combinatorial Chemistry Techniques,
	Cultured, Drug Delivery Systems, Drug Design, Drug Evaluation, Enzyme
	Inhibitors, Formazans, Gene Expression, Humans, Inhibitory Concentration
	50, Kinetics, Magnetic Resonance Spectroscopy, Mass, Mitochondria,
	Models, Molecular, Neoplasms, Neoplastic, Non-P.H.S., Non-U.S. Gov't,
	P.H.S., Paclitaxel, Peptide Library, Pharmaceutical Preparations,
	Pharmacology, Phosphoprotein Phosphatase, Preclinical, Protease Inhibitors,
	Protein-Tyrosine-Phosphatase, Research Support, Sensitivity and Specificity,
	Signal Transduction, Spectrum Analysis, Stereoisomerism, Structure-Activity
	Relationship, Sulfonic Acids, Tetrazolium Salts, Thiazoles, Toxicity
	Tests, Tumor, Tumor Cells, U.S. Gov't, cdc25 Phosphatase},
  owner = {mahe},
  pmid = {10869367},
  timestamp = {2006.08.22}
}
@article{Lee2003Discovery,
  author = {Dongkwon Lee and Sang Wook Choi and Myengsoo Kim and Jin Hyun Park
	and Moonkyu Kim and Jungchul Kim and In-Beum Lee},
  title = {Discovery of differentially expressed genes related to histological
	subtype of hepatocellular carcinoma.},
  journal = {Biotechnol. Prog.},
  year = {2003},
  volume = {19},
  pages = {1011--1015},
  number = {3},
  abstract = {Hepatocellular carcinoma ({HCC}) is one of the most common human malignancies
	in the world. {T}o identify the histological subtype-specific genes
	of {HCC}, we analyzed the gene expression profile of 10 {HCC} patients
	by means of c{DNA} microarray. {W}e proposed a systematic approach
	for determining the discriminatory genes and revealing the biological
	phenomena of {HCC} with c{DNA} microarray data. {F}irst, normalization
	of c{DNA} microarray data was performed to reduce or minimize systematic
	variations. {O}n the basis of the suitably normalized data, we identified
	specific genes involved in histological subtype of {HCC}. {T}wo classification
	methods, {F}isher's discriminant analysis ({FDA}) and support vector
	machine ({SVM}), were used to evaluate the reliability of the selected
	genes and discriminate the histological subtypes of {HCC}. {T}his
	study may provide a clue for the needs of different chemotherapy
	and the reason for heterogeneity of the clinical responses according
	to histological subtypes.},
  doi = {10.1021/bp025746a},
  pdf = {../local/Lee2003Discovery.pdf},
  file = {Lee2003Discovery.pdf:local/Lee2003Discovery.pdf:PDF},
  keywords = {biosvm},
  url = {http://dx.doi.org/10.1021/bp025746a}
}
@article{Lee2003Classification,
  author = {Lee, Y. and Lee, C.-K.},
  title = {Classification of multiple cancer types by multicategory support
	vector machines using gene expression data},
  journal = {Bioinformatics},
  year = {2003},
  volume = {19},
  pages = {1132--1139},
  number = {9},
  abstract = {Motivation: {H}igh-density {DNA} microarray measures the activities
	of several thousand genes simultaneously and the gene expression
	profiles have been used for the cancer classification recently. {T}his
	new approach promises to give better therapeutic measurements to
	cancer patients by diagnosing cancer types with improved accuracy.
	{T}he {S}upport {V}ector {M}achine ({SVM}) is one of the classification
	methods successfully applied to the cancer diagnosis problems. {H}owever,
	its optimal extension to more than two classes was not obvious, which
	might impose limitations in its application to multiple tumor types.
	{W}e briefly introduce the {M}ulticategory {SVM}, which is a recently
	proposed extension of the binary {SVM}, and apply it to multiclass
	cancer diagnosis problems. {R}esults: {I}ts applicability is demonstrated
	on the leukemia data ({G}olub et al., 1999) and the small round blue
	cell tumors of childhood data ({K}han et al., 2001). {C}omparable
	classification accuracy shown in the applications and its flexibility
	render the {MSVM} a viable alternative to other classification methods.
	{S}upplementary {I}nformation: http://www.stat.ohio-state.edu/~yklee/msvm.html
	{C}ontact: yklee@stat.ohio-state.edu},
  pdf = {../local/Lee2003Classification.pdf},
  file = {Lee2003Classification.pdf:local/Lee2003Classification.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/9/1132}
}
@inproceedings{Leslie2002spectrum,
  author = {Leslie, C. and Eskin, E. and Noble, W. S.},
  title = {The spectrum kernel: a string kernel for {SVM} protein classification},
  booktitle = {Proceedings of the {Pacific Symposium on Biocomputing} 2002},
  year = {2002},
  editor = {Russ B. Altman and A. Keith Dunker and Lawrence Hunter and Kevin
	Lauderdale and Teri E. Klein},
  pages = {564--575},
  address = {Singapore},
  publisher = {World Scientific},
  pdf = {../local/lesl02.pdf},
  file = {lesl02.pdf:local/lesl02.pdf:PDF},
  keywords = {biosvm},
  subject = {biokernel}
}
@inproceedings{Leslie2003Mismatch,
  author = {Leslie, C. and Eskin, E. and Weston, J. and Noble, W. S.},
  title = {Mismatch String Kernels for {SVM} Protein Classification},
  booktitle = {Advances in Neural Information Processing Systems 15},
  year = {2003},
  editor = {Suzanna Becker and Sebastian Thrun and Klaus Obermayer},
  publisher = {MIT Press},
  pdf = {../local/lesl02b.pdf},
  file = {lesl02b.pdf:local/lesl02b.pdf:PDF},
  keywords = {biosvm},
  subject = {biokernel},
  url = {http://www.cs.columbia.edu/~cleslie/papers/mismatch-short.pdf}
}
@article{Leslie2004Mismatch,
  author = {Leslie, C. S. and Eskin, E. and Cohen, A. and Weston, J. and Noble,
	W. S.},
  title = {Mismatch string kernels for discriminative protein classification},
  journal = {Bioinformatics},
  year = {2004},
  volume = {20},
  pages = {467--476},
  number = {4},
  abstract = {Motivation: {C}lassification of proteins sequences into functional
	and structural families based on sequence homology is a central problem
	in computational biology. {D}iscriminative supervised machine learning
	approaches provide good performance, but simplicity and computational
	efficiency of training and prediction are also important concerns.
	{R}esults: {W}e introduce a class of string kernels, called mismatch
	kernels, for use with support vector machines ({SVM}s) in a discriminative
	approach to the problem of protein classification and remote homology
	detection. {T}hese kernels measure sequence similarity based on shared
	occurrences of fixed-length patterns in the data, allowing for mutations
	between patterns. {T}hus, the kernels provide a biologically well-motivated
	way to compare protein sequences without relying on family-based
	generative models such as hidden {M}arkov models. {W}e compute the
	kernels efficiently using a mismatch tree data structure, allowing
	us to calculate the contributions of all patterns occurring in the
	data in one pass while traversing the tree. {W}hen used with an {SVM},
	the kernels enable fast prediction on test sequences. {W}e report
	experiments on two benchmark {SCOP} datasets, where we show that
	the mismatch kernel used with an {SVM} classifier performs competitively
	with state-of-the-art methods for homology detection, particularly
	when very few training examples are available. {E}xamination of the
	highest-weighted patterns learned by the {SVM} classifier recovers
	biologically important motifs in protein families and superfamilies.
	{A}vailability: {SVM} software is publicly available at http://microarray.cpmc.columbia.edu/gist.
	{M}ismatch kernel software is available upon request.},
  pdf = {../local/Leslie2004Mismatch.pdf},
  file = {Leslie2004Mismatch.pdf:local/Leslie2004Mismatch.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/20/4/467}
}
@article{Lett2004Interaction,
  author = {Lett, D. and Hsing, M. and Pio, F.},
  title = {Interaction profile-based protein classification of death domain},
  journal = {BMC Bioinformatics},
  year = {2004},
  volume = {5},
  pages = {75},
  abstract = {Background {T}he increasing number of protein sequences and 3{D} structure
	obtained from genomic initiatives is leading many of us to focus
	on proteomics, and to dedicate our experimental and computational
	efforts on the creation and analysis of information derived from
	3{D} structure. {I}n particular, the high-throughput generation of
	protein-protein interaction data from a few organisms makes such
	an approach very important towards understanding the molecular recognition
	that make-up the entire protein-protein interaction network. {S}ince
	the generation of sequences, and experimental protein-protein interactions
	increases faster than the 3{D} structure determination of protein
	complexes, there is tremendous interest in developing in silico methods
	that generate such structure for prediction and classification purposes.
	{I}n this study we focused on classifying protein family members
	based on their protein-protein interaction distinctiveness. {S}tructure-based
	classification of protein-protein interfaces has been described initially
	by {P}onstingl et al. [1] and more recently by {V}aldar et al. [2]
	and {M}intseris et al. [3], from complex structures that have been
	solved experimentally. {H}owever, little has been done on protein
	classification based on the prediction of protein-protein complexes
	obtained from homology modeling and docking simulation. {R}esults
	{W}e have developed an in silico classification system entitled {HODOCO}
	({H}omology modeling, {D}ocking and {C}lassification {O}racle), in
	which protein {R}esidue {P}otential {I}nteraction {P}rofiles ({RPIPS})
	are used to summarize protein-protein interaction characteristics.
	{T}his system applied to a dataset of 64 proteins of the death domain
	superfamily was used to classify each member into its proper subfamily.
	{T}wo classification methods were attempted, heuristic and support
	vector machine learning. {B}oth methods were tested with a 5-fold
	cross-validation. {T}he heuristic approach yielded a 61\% average
	accuracy, while the machine learning approach yielded an 89\% average
	accuracy. {C}onclusion {W}e have confirmed the reliability and potential
	value of classifying proteins via their predicted interactions. {O}ur
	results are in the same range of accuracy as other studies that classify
	protein-protein interactions from 3{D} complex structure obtained
	experimentally. {W}hile our classification scheme does not take directly
	into account sequence information our results are in agreement with
	functional and sequence based classification of death domain family
	members.},
  doi = {10.1186/1471-2105-5-75},
  pdf = {../local/Lett2004Interaction.pdf},
  file = {Lett2004Interaction.pdf:local/Lett2004Interaction.pdf:PDF},
  keywords = {biosvm},
  owner = {vert},
  url = {http://www.biomedcentral.com/1471-2105/5/75}
}
@article{Li2005Prediction,
  author = {H. Li and C. Ung and C. Yap and Y. Xue and Z. Li and Z. Cao and Y.
	Chen},
  title = {Prediction of genotoxicity of chemical compounds by statistical learning
	methods.},
  journal = {Chem. Res. Toxicol.},
  year = {2005},
  volume = {18},
  pages = {1071--1080},
  number = {6},
  month = jun,
  abstract = {Various toxicological profiles, such as genotoxic potential, need
	to be studied in drug discovery processes and submitted to the drug
	regulatory authorities for drug safety evaluation. {A}s part of the
	effort for developing low cost and efficient adverse drug reaction
	testing tools, several statistical learning methods have been used
	for developing genotoxicity prediction systems with an accuracy of
	up to 73.8\% for genotoxic ({GT}+) and 92.8\% for nongenotoxic ({GT}-)
	agents. {T}hese systems have been developed and tested by using less
	than 400 known {GT}+ and {GT}- agents, which is significantly less
	in number and diversity than the 860 {GT}+ and {GT}- agents known
	at present. {T}here is a need to examine if a similar level of accuracy
	can be achieved for the more diverse set of molecules and to evaluate
	other statistical learning methods not yet applied to genotoxicity
	prediction. {T}his work is intended for testing several statistical
	learning methods by using 860 {GT}+ and {GT}- agents, which include
	support vector machines ({SVM}), probabilistic neural network ({PNN}),
	k-nearest neighbor (k-{NN}), and {C}4.5 decision tree ({DT}). {A}
	feature selection method, recursive feature elimination, is used
	for selecting molecular descriptors relevant to genotoxicity study.
	{T}he overall accuracies of {SVM}, k-{NN}, and {PNN} are comparable
	to and those of {DT} lower than the results from earlier studies,
	with {SVM} giving the highest accuracies of 77.8\% for {GT}+ and
	92.7\% for {GT}- agents. {O}ur study suggests that statistical learning
	methods, particularly {SVM}, k-{NN}, and {PNN}, are useful for facilitating
	the prediction of genotoxic potential of a diverse set of molecules.},
  doi = {10.1021/tx049652h},
  pdf = {../local/Li2005Prediction.pdf},
  file = {Li2005Prediction.pdf:local/Li2005Prediction.pdf:PDF},
  keywords = {biosvm chemoinformatics},
  url = {http://dx.doi.org/10.1021/tx049652h}
}
@article{Li2003Simple,
  author = {Jinyan Li and Huiqing Liu and James R Downing and Allen Eng-Juh Yeoh
	and Limsoon Wong},
  title = {Simple rules underlying gene expression profiles of more than six
	subtypes of acute lymphoblastic leukemia ({ALL}) patients.},
  journal = {Bioinformatics},
  year = {2003},
  volume = {19},
  pages = {71--78},
  number = {1},
  month = jan,
  abstract = {M{OTIVATIONS} {AND} {RESULTS}: {F}or classifying gene expression profiles
	or other types of medical data, simple rules are preferable to non-linear
	distance or kernel functions. {T}his is because rules may help us
	understand more about the application in addition to performing an
	accurate classification. {I}n this paper, we discover novel rules
	that describe the gene expression profiles of more than six subtypes
	of acute lymphoblastic leukemia ({ALL}) patients. {W}e also introduce
	a new classifier, named {PCL}, to make effective use of the rules.
	{PCL} is accurate and can handle multiple parallel classifications.
	{W}e evaluate this method by classifying 327 heterogeneous {ALL}
	samples. {O}ur test error rate is competitive to that of support
	vector machines, and it is 71\% better than {C}4.5, 50\% better than
	{N}aive {B}ayes, and 43\% better than k-nearest neighbour. {E}xperimental
	results on another independent data sets are also presented to show
	the strength of our method. {AVAILABILITY}: {U}nder http://sdmc.lit.org.sg/{GED}atasets/,
	click on {S}upplementary {I}nformation.},
  keywords = {Acute, Algorithms, Automated, Base Pair Mismatch, Base Pairing, Base
	Sequence, Biological, Biosensing Techniques, Cluster Analysis, Comparative
	Study, Computer-Assisted, DNA, Gene Expression Profiling, Gene Expression
	Regulation, Genes, Genetic, Genetic Markers, Hemolysins, Humans,
	Leukemia, Lymphocytic, Markov Chains, Messenger, Models, Molecular
	Probe Techniques, Molecular Sequence Data, Nanotechnology, Neoplasm,
	Neoplastic, Neural Networks (Computer), Non-U.S. Gov't, Nucleic Acid
	Conformation, Oligonucleotide Array Sequence Analysis, Pattern Recognition,
	Quality Control, RNA, Research Support, Signal Processing, Statistical,
	Stomach Neoplasms, Tumor Markers},
  pmid = {12499295}
}
@article{Li2005robust,
  author = {Li, L. and Jiang, W. and Li, X. and Moser, K. L. and Guo, Z. and Du,
	L. and Wang, Q. and Topol, E. J. and Wang, Q. and Rao, S.},
  title = {A robust hybrid between genetic algorithm and support vector machine
	for extracting an optimal feature gene subset},
  journal = {Genomics},
  year = {2005},
  volume = {85},
  pages = {16--23},
  number = {1},
  abstract = {Development of a robust and efficient approach for extracting useful
	information from microarray data continues to be a significant and
	challenging task. {M}icroarray data are characterized by a high dimension,
	high signal-to-noise ratio, and high correlations between genes,
	but with a relatively small sample size. {C}urrent methods for dimensional
	reduction can further be improved for the scenario of the presence
	of a single (or a few) high influential gene(s) in which its effect
	in the feature subset would prohibit inclusion of other important
	genes. {W}e have formalized a robust gene selection approach based
	on a hybrid between genetic algorithm and support vector machine.
	{T}he major goal of this hybridization was to exploit fully their
	respective merits (e.g., robustness to the size of solution space
	and capability of handling a very large dimension of feature genes)
	for identification of key feature genes (or molecular signatures)
	for a complex biological phenotype. {W}e have applied the approach
	to the microarray data of diffuse large {B} cell lymphoma to demonstrate
	its behaviors and properties for mining the high-dimension data of
	genome-wide gene expression profiles. {T}he resulting classifier(s)
	(the optimal gene subset(s)) has achieved the highest accuracy (99\%)
	for prediction of independent microarray samples in comparisons with
	marginal filters and a hybrid between genetic algorithm and {K} nearest
	neighbors.},
  doi = {10.1016/j.ygeno.2004.09.007},
  pdf = {../local/Li2005robust.pdf},
  file = {Li2005robust.pdf:local/Li2005robust.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1016/j.ygeno.2004.09.007}
}
@article{Li2004Data,
  author = {Li, L. and Tang, H. and Wu, Z. and Gong, J. and Gruidl, M. and Zou,
	J. and Tockman, M. and Clark, R. A.},
  title = {Data mining techniques for cancer detection using serum proteomic
	profiling.},
  journal = {Artif. Intell. Med.},
  year = {2004},
  volume = {32},
  pages = {71--83},
  number = {2},
  abstract = {O{BJECTIVE}: {P}athological changes in an organ or tissue may be reflected
	in proteomic patterns in serum. {I}t is possible that unique serum
	proteomic patterns could be used to discriminate cancer samples from
	non-cancer ones. {D}ue to the complexity of proteomic profiling,
	a higher order analysis such as data mining is needed to uncover
	the differences in complex proteomic patterns. {T}he objectives of
	this paper are (1) to briefly review the application of data mining
	techniques in proteomics for cancer detection/diagnosis; (2) to explore
	a novel analytic method with different feature selection methods;
	(3) to compare the results obtained on different datasets and that
	reported by {P}etricoin et al. in terms of detection performance
	and selected proteomic patterns. {METHODS} {AND} {MATERIAL}: {T}hree
	serum {SELDI} {MS} data sets were used in this research to identify
	serum proteomic patterns that distinguish the serum of ovarian cancer
	cases from non-cancer controls. {A} support vector machine-based
	method is applied in this study, in which statistical testing and
	genetic algorithm-based methods are used for feature selection respectively.
	{L}eave-one-out cross validation with receiver operating characteristic
	({ROC}) curve is used for evaluation and comparison of cancer detection
	performance. {RESULTS} {AND} {CONCLUSIONS}: {T}he results showed
	that (1) data mining techniques can be successfully applied to ovarian
	cancer detection with a reasonably high performance; (2) the classification
	using features selected by the genetic algorithm consistently outperformed
	those selected by statistical testing in terms of accuracy and robustness;
	(3) the discriminatory features (proteomic patterns) can be very
	different from one selection method to another. {I}n other words,
	the pattern selection and its classification efficiency are highly
	classifier dependent. {T}herefore, when using data mining techniques,
	the discrimination of cancer from normal does not depend solely upon
	the identity and origination of cancer-related proteins.},
  doi = {10.1016/j.artmed.2004.03.006},
  pdf = {../local/Li2004Data.pdf},
  file = {Li2004Data.pdf:local/Li2004Data.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1016/j.artmed.2004.03.006}
}
@article{Li2004comparative,
  author = {Li, T. and Zhang, C. and Ogihara, M.},
  title = {A comparative study of feature selection and multiclass classification
	methods for tissue classification based on gene expression},
  journal = {Bioinformatics},
  year = {2004},
  volume = {20},
  pages = {2429--2437},
  number = {15},
  abstract = {Summary: {T}his paper studies the problem of building multiclass classifiers
	for tissue classification based on gene expression. {T}he recent
	development of microarray technologies has enabled biologists to
	quantify gene expression of tens of thousands of genes in a single
	experiment. {B}iologists have begun collecting gene expression for
	a large number of samples. {O}ne of the urgent issues in the use
	of microarray data is to develop methods for characterizing samples
	based on their gene expression. {T}he most basic step in the research
	direction is binary sample classification, which has been studied
	extensively over the past few years. {T}his paper investigates the
	next step--multiclass classification of samples based on gene expression.
	{T}he characteristics of expression data (e.g. large number of genes
	with small sample size) makes the classification problem more challenging.
	{T}he process of building multiclass classifiers is divided into
	two components: (i) selection of the features (i.e. genes) to be
	used for training and testing and (ii) selection of the classification
	method. {T}his paper compares various feature selection methods as
	well as various state-of-the-art classification methods on various
	multiclass gene expression datasets. {O}ur study indicates that multiclass
	classification problem is much more difficult than the binary one
	for the gene expression datasets. {T}he difficulty lies in the fact
	that the data are of high dimensionality and that the sample size
	is small. {T}he classification accuracy appears to degrade very rapidly
	as the number of classes increases. {I}n particular, the accuracy
	was very low regardless of the choices of the methods for large-class
	datasets (e.g. {NCI}60 and {GCM}). {W}hile increasing the number
	of samples is a plausible solution to the problem of accuracy degradation,
	it is important to develop algorithms that are able to analyze effectively
	multiple-class expression data for these special datasets.},
  pdf = {../local/Li2004comparative.pdf},
  file = {Li2004comparative.pdf:local/Li2004comparative.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/20/15/2429}
}
@article{Liang2001Detection,
  author = {Liang, H. and Lin, Z.},
  title = {Detection of delayed gastric emptying from electrogastrograms with
	support vector machine.},
  journal = {I{EEE} {T}rans {B}iomed {E}ng},
  year = {2001},
  volume = {48},
  pages = {601--604},
  number = {5},
  month = may,
  abstract = {A recent study reported a conventional neural network ({NN}) approach
	for the noninvasive diagnosis of delayed gastric emptying from the
	cutaneous electrogastrograms. {U}sing support vector machine, we
	show that this relatively new technique can be used for detection
	of delayed gastric emptying and is in fact able to outdo the conventional
	{NN}.},
  keywords = {Algorithms, Amino Acid Sequence, Artificial Intelligence, Biological,
	Cell Compartmentation, Comparative Study, Computer Simulation, Computer-Assisted,
	Decision Trees, Diagnosis, Discriminant Analysis, Electrophysiology,
	Gastric Emptying, Humans, Logistic Models, Melanoma, Models, Neural
	Networks (Computer), Nevus, Non-U.S. Gov't, Organelles, P.H.S., Pigmented,
	Predictive Value of Tests, Proteins, Reproducibility of Results,
	Research Support, Skin Diseases, Skin Neoplasms, Skin Pigmentation,
	Stomach Diseases, U.S. Gov't},
  pmid = {11341535}
}
@article{Liao2003Combining,
  author = {Liao, L. and Noble, W. S.},
  title = {Combining {P}airwise {S}equence {S}imilarity and {S}upport {V}ector
	{M}achines for {D}etecting {R}emote {P}rotein {E}volutionary and
	{S}tructural {R}elationships},
  journal = {J. {C}omput. {B}iol.},
  year = {2003},
  volume = {10},
  pages = {857--868},
  number = {6},
  abstract = {One key element in understanding the molecular machinery of the cell
	is to understand the structure and function of each protein encoded
	in the genome. {A} very successful means of inferring the structure
	or function of a previously unannotated protein is via sequence similarity
	with one or more proteins whose structure or function is already
	known. {T}oward this end, we propose a means of representing proteins
	using pairwise sequence similarity scores. {T}his representation,
	combined with a discriminative classification algorithm known as
	the support vector machine ({SVM}), provides a powerful means of
	detecting subtle structural and evolutionary relationships among
	proteins. {T}he algorithm, called {SVM}-pairwise, when tested on
	its ability to recognize previously unseen families from the {SCOP}
	database, yields significantly better performance than {SVM}-{F}isher,
	profile {HMM}s, and {PSI}-{BLAST}.},
  doi = {10.1089/106652703322756113},
  pdf = {../local/Liao2003Combining.pdf},
  file = {Liao2003Combining.pdf:local/Liao2003Combining.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://www.liebertonline.com/doi/abs/10.1089/106652703322756113}
}
@inproceedings{Liao2002Combining,
  author = {Liao, L. and Noble, W. S.},
  title = {Combining pairwise sequence similarity and support vector machines
	for remote protein homology detection},
  booktitle = {Proceedings of the {S}ixth {I}nternational {C}onference on {C}omputational
	{M}olecular {B}iology},
  year = 2002,
  keywords = {biosvm},
  subject = {biokernelcasp},
  url = {http://www.cs.columbia.edu/~bgrundy/papers/fps-svm.html},
  pdf = {../local/liao02.pdf},
  file = {liao02.pdf:local/liao02.pdf:PDF}
}
@article{Liberles2002use,
  author = {Liberles, D. A. and Thor{\'e}n, A. and von Heijne, G. and Elofsson,
	A.},
  title = {The use of phylogenetic profiles for gene predictions},
  journal = {Curr. {G}enom.},
  year = 2002,
  note = {To appear},
  subject = {bio},
  url = {http://www.sbc.su.se/~arne/papers/phylo.pdf},
  pdf = {../local/libe02.pdf},
  file = {libe02.pdf:local/libe02.pdf:PDF}
}
@article{Lievens2009Mammalian,
  author = {Lievens, Sam and Lemmens, Irma and Tavernier, Jan},
  title = {Mammalian two-hybrids come of age.},
  journal = {Trends Biochem Sci},
  year = {2009},
  volume = {34},
  pages = {579--588},
  number = {11},
  month = nov,
  abstract = {A diverse series of mammalian two-hybrid technologies for the detection
	of protein-protein interactions have emerged in the past few years,
	complementing the established yeast two-hybrid approach. Given the
	mammalian background in which they operate, these assays open new
	avenues to study the dynamics of mammalian protein interaction networks,
	i.e. the temporal, spatial and functional modulation of protein-protein
	associations. In addition, novel assay formats are available that
	enable high-throughput mammalian two-hybrid applications, facilitating
	their use in large-scale interactome mapping projects. Finally, as
	they can be applied in drug discovery and development programs, these
	techniques also offer exciting new opportunities for biomedical research.},
  doi = {10.1016/j.tibs.2009.06.009},
  institution = {Department of Medical Protein Research, VIB, A. Baertsoenkaai 3,
	9000 Ghent, Belgium},
  keywords = {Animals; Genes, Reporter; Humans; Models, Biological; Protein Binding;
	Protein Interaction Mapping; Proteins; Recombinant Fusion Proteins;
	Transfection; Two-Hybrid System Techniques},
  owner = {phupe},
  pii = {S0968-0004(09)00158-3},
  pmid = {19786350},
  timestamp = {2010.08.31},
  url = {http://dx.doi.org/10.1016/j.tibs.2009.06.009}
}
@article{Lima-Mendez2009powerful,
  author = {Lima-Mendez, G. and van Helden, J.},
  title = {The powerful law of the power law and other myths in network biology.},
  journal = {Mol Biosyst},
  year = {2009},
  volume = {5},
  pages = {1482--1493},
  number = {12},
  month = dec,
  abstract = {For almost 10 years, topological analysis of different large-scale
	biological networks (metabolic reactions, protein interactions, transcriptional
	regulation) has been highlighting some recurrent properties: power
	law distribution of degree, scale-freeness, small world, which have
	been proposed to confer functional advantages such as robustness
	to environmental changes and tolerance to random mutations. Stochastic
	generative models inspired different scenarios to explain the growth
	of interaction networks during evolution. The power law and the associated
	properties appeared so ubiquitous in complex networks that they were
	qualified as "universal laws". However, these properties are no longer
	observed when the data are subjected to statistical tests: in most
	cases, the data do not fit the expected theoretical models, and the
	cases of good fitting merely result from sampling artefacts or improper
	data representation. The field of network biology seems to be founded
	on a series of myths, i.e. widely believed but false ideas. The weaknesses
	of these foundations should however not be considered as a failure
	for the entire domain. Network analysis provides a powerful frame
	for understanding the function and evolution of biological processes,
	provided it is brought to an appropriate level of description, by
	focussing on smaller functional modules and establishing the link
	between their topological properties and their dynamical behaviour.},
  doi = {10.1039/b908681a},
  institution = {Bioinformatique des Génomes et des Réseaux-BiGRe, Université Libre
	de Bruxelles, Campus Plaine, CP 263, Boulevard du Triomphe, B-1050
	Bruxelles, Belgium. gipsi@bigre.ulb.ac.be},
  keywords = {Computational Biology, methods; Gene Regulatory Networks; Metabolic
	Networks and Pathways; Models, Biological; Semantics; Signal Transduction},
  language = {eng},
  medline-pst = {ppublish},
  owner = {Andrei Zinovyev},
  pmid = {20023717},
  timestamp = {2011.04.07},
  url = {http://dx.doi.org/10.1039/b908681a}
}
@article{Lin2002Conserved,
  author = {Lin, K. and Kuang, Y. and Joseph, J. S. and Kolatkar, P. R.},
  title = {Conserved codon composition of ribosomal protein coding genes in
	{E}scherichia coli, {M}ycobacterium tuberculosis and {S}accharomyces
	cerevisiae: lessons from supervised machine learning in functional
	genomics},
  journal = {Nucl. {A}cids {R}es.},
  year = {2002},
  volume = {30},
  pages = {2599--2607},
  number = {11},
  abstract = {Genomics projects have resulted in a flood of sequence data. {F}unctional
	annotation currently relies almost exclusively on inter-species sequence
	comparison and is restricted in cases of limited data from related
	species and widely divergent sequences with no known homologs. {H}ere,
	we demonstrate that codon composition, a fusion of codon usage bias
	and amino acid composition signals, can accurately discriminate,
	in the absence of sequence homology information, cytoplasmic ribosomal
	protein genes from all other genes of known function in {S}accharomyces
	cerevisiae, {E}scherichia coli and {M}ycobacterium tuberculosis using
	an implementation of support vector machines, {SVM}light. {A}nalysis
	of these codon composition signals is instructive in determining
	features that confer individuality to ribosomal protein genes. {E}ach
	of the sets of positively charged, negatively charged and small hydrophobic
	residues, as well as codon bias, contribute to their distinctive
	codon composition profile. {T}he representation of all these signals
	is sensitively detected, combined and augmented by the {SVM}s to
	perform an accurate classification. {O}f special mention is an obvious
	outlier, yeast gene {RPL}22{B}, highly homologous to {RPL}22{A} but
	employing very different codon usage, perhaps indicating a non-ribosomal
	function. {F}inally, we propose that codon composition be used in
	combination with other attributes in gene/protein classification
	by supervised machine learning algorithms.},
  pdf = {../local/Lin2002Conserved.pdf},
  file = {Lin2002Conserved.pdf:local/Lin2002Conserved.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://nar.oupjournals.org/cgi/content/abstract/30/11/2599}
}
@article{Lind2003Support,
  author = {Lind, P. and Maltseva, T.},
  title = {Support vector machines for the estimation of aqueous solubility.},
  journal = {J {C}hem {I}nf {C}omput {S}ci},
  year = {2003},
  volume = {43},
  pages = {1855--1859},
  number = {6},
  abstract = {Support {V}ector {M}achines ({SVM}s) are used to estimate aqueous
	solubility of organic compounds. {A} {SVM} equipped with a {T}animoto
	similarity kernel estimates solubility with accuracy comparable to
	results from other reported methods where the same data sets have
	been studied. {C}omplete cross-validation on a diverse data set resulted
	in a root-mean-squared error = 0.62 and {R}(2) = 0.88. {T}he data
	input to the machine is in the form of molecular fingerprints. {N}o
	physical parameters are explicitly involved in calculations.},
  doi = {10.1021/ci034107s},
  pdf = {../local/Lind2003Support.pdf},
  file = {Lind2003Support.pdf:local/Lind2003Support.pdf:PDF},
  keywords = {biosvm, chemoinformatics},
  url = {http://dx.doi.org/10.1021/ci034107s}
}
@article{Listgarten2004Predictive,
  author = {Listgarten, J. and Damaraju, S. and Poulin, B. and Cook, L. and Dufour,
	J. and Driga, A. and Mackey, J. and Wishart, D. and Greiner, R. and
	Zanke, B.},
  title = {Predictive {M}odels for {B}reast {C}ancer {S}usceptibility from {M}ultiple
	{S}ingle {N}ucleotide {P}olymorphisms},
  journal = {Clin. {C}ancer {R}es.},
  year = {2004},
  volume = {10},
  pages = {2725--2737},
  number = {8},
  abstract = {Hereditary predisposition and causative environmental exposures have
	long been recognized in human malignancies. {I}n most instances,
	cancer cases occur sporadically, suggesting that environmental influences
	are critical in determining cancer risk. {T}o test the influence
	of genetic polymorphisms on breast cancer risk, we have measured
	98 single nucleotide polymorphisms ({SNP}s) distributed over 45 genes
	of potential relevance to breast cancer etiology in 174 patients
	and have compared these with matched normal controls. {U}sing machine
	learning techniques such as support vector machines ({SVM}s), decision
	trees, and naive {B}ayes, we identified a subset of three {SNP}s
	as key discriminators between breast cancer and controls. {T}he {SVM}s
	performed maximally among predictive models, achieving 69\% predictive
	power in distinguishing between the two groups, compared with a 50\%
	baseline predictive power obtained from the data after repeated random
	permutation of class labels (individuals with cancer or controls).
	{H}owever, the simpler naive {B}ayes model as well as the decision
	tree model performed quite similarly to the {SVM}. {T}he three {SNP}
	sites most useful in this model were (a) the +4536{T}/{C} site of
	the aldosterone synthase gene {CYP}11{B}2 at amino acid residue 386
	{V}al/{A}la ({T}/{C}) (rs4541); (b) the +4328{C}/{G} site of the
	aryl hydrocarbon hydroxylase {CYP}1{B}1 at amino acid residue 293
	{L}eu/{V}al ({C}/{G}) (rs5292); and (c) the +4449{C}/{T} site of
	the transcription factor {BCL}6 at amino acid 387 {A}sp/{A}sp (rs1056932).
	{N}o single {SNP} site on its own could achieve more than 60\% in
	predictive accuracy. {W}e have shown that multiple {SNP} sites from
	different genes over distant parts of the genome are better at identifying
	breast cancer patients than any one {SNP} alone. {A}s high-throughput
	technology for {SNP}s improves and as more {SNP}s are identified,
	it is likely that much higher predictive accuracy will be achieved
	and a useful clinical tool developed.},
  eprint = {http://clincancerres.aacrjournals.org/cgi/reprint/10/8/2725.pdf},
  pdf = {../local/Listgarten2004Predictive.pdf},
  file = {Listgarten2004Predictive.pdf:local/Listgarten2004Predictive.pdf:PDF},
  keywords = {biosvm, breastcancer},
  owner = {jeanphilippevert},
  url = {http://clincancerres.aacrjournals.org/cgi/content/abstract/10/8/2725}
}
@article{Liu2004Using,
  author = {Liu, Huiqing and Han, Hao and Li, Jinyan and Wong, Limsoon},
  title = {Using amino acid patterns to accurately predict translation initiation
	sites.},
  journal = {In {S}ilico {B}iol.},
  year = {2004},
  volume = {4},
  pages = {255--269},
  number = {3},
  abstract = {The translation initiation site ({TIS}) prediction problem is about
	how to correctly identify {TIS} in m{RNA}, c{DNA}, or other types
	of genomic sequences. {H}igh prediction accuracy can be helpful in
	a better understanding of protein coding from nucleotide sequences.
	{T}his is an important step in genomic analysis to determine protein
	coding from nucleotide sequences. {I}n this paper, we present an
	in silico method to predict translation initiation sites in vertebrate
	c{DNA} or m{RNA} sequences. {T}his method consists of three sequential
	steps as follows. {I}n the first step, candidate features are generated
	using k-gram amino acid patterns. {I}n the second step, a small number
	of top-ranked features are selected by an entropy-based algorithm.
	{I}n the third step, a classification model is built to recognize
	true {TIS}s by applying support vector machines or ensembles of decision
	trees to the selected features. {W}e have tested our method on several
	independent data sets, including two public ones and our own extracted
	sequences. {T}he experimental results achieved are better than those
	reported previously using the same data sets. {O}ur high accuracy
	not only demonstrates the feasibility of our method, but also indicates
	that there might be "amino acid" patterns around {TIS} in c{DNA}
	and m{RNA} sequences.},
  keywords = {biosvm},
  pii = {2004040022},
  url = {http://www.bioinfo.de/isb/2004/04/0022/}
}
@article{Liu2003in-silico,
  author = {Liu, Huiqing and Han, Hao and Li, Jinyan and Wong, Limsoon},
  title = {An in-silico method for prediction of polyadenylation signals in
	human sequences.},
  journal = {Genome {I}nform {S}er {W}orkshop {G}enome {I}nform},
  year = {2003},
  volume = {14},
  pages = {84--93},
  abstract = {This paper presents a machine learning method to predict polyadenylation
	signals ({PAS}es) in human {DNA} and m{RNA} sequences by analysing
	features around them. {T}his method consists of three sequential
	steps of feature manipulation: generation, selection and integration
	of features. {I}n the first step, new features are generated using
	k-gram nucleotide acid or amino acid patterns. {I}n the second step,
	a number of important features are selected by an entropy-based algorithm.
	{I}n the third step, support vector machines are employed to recognize
	true {PAS}es from a large number of candidates. {O}ur study shows
	that true {PAS}es in {DNA} and m{RNA} sequences can be characterized
	by different features, and also shows that both upstream and downstream
	sequence elements are important for recognizing {PAS}es from {DNA}
	sequences. {W}e tested our method on several public data sets as
	well as our own extracted data sets. {I}n most cases, we achieved
	better validation results than those reported previously on the same
	data sets. {T}he important motifs observed are highly consistent
	with those reported in literature.},
  keywords = {biosvm}
}
@article{Liu2005Use,
  author = {Liu, Huiqing and Li, Jinyan and Wong, Limsoon},
  title = {Use of extreme patient samples for outcome prediction from gene expression
	data.},
  journal = {Bioinformatics},
  year = {2005},
  month = jun,
  abstract = {M{OTIVATION}: {P}atient outcome prediction using microarray technologies
	is an important application in bioinformatics. {B}ased on patients'
	genotypic microarray data, predictions are made to estimate patients'
	survival time and their risk of tumor metastasis or recurrence. {S}o,
	accurate prediction can potentially help to provide better treatment
	for patients. {RESULTS}: {W}e present a new computational method
	for patient outcome prediction. {I}n the training phase of this method,
	we make use of two types of extreme patient samples: short-term survivors
	who got an unfavorable outcome within a short period and long-term
	survivors who were maintaining a favorable outcome after a long follow-up
	time. {T}hese extreme training samples yield a clear platform for
	us to identify relevant genes whose expression is closely related
	to the outcome. {T}he selected extreme samples and the relevant genes
	are then integrated by a support vector machine to build a prediction
	model, by which each validation sample is assigned a risk score that
	falls into one of special pre-defined risk groups. {W}e apply this
	method to several public data sets. {I}n most cases, patients in
	high and low risk groups stratified by our method have clearly distinguishable
	outcome status as seen in their {K}aplan-{M}eier curves. {W}e also
	show that the idea of selecting only extreme patient samples for
	training is effective for improving the prediction accuracy when
	different gene selection methods are used. {SUPPLEMENTARY} {INFORMATION}:
	http://research.i2r.a-star.edu.sg/huiqing/supplementaldata/survival/survival.html.},
  doi = {10.1093/bioinformatics/bti544},
  pdf = {../local/Liu2005Use.pdf},
  file = {Liu2005Use.pdf:local/Liu2005Use.pdf:PDF},
  keywords = {biosvm},
  pii = {bti544},
  url = {http://dx.doi.org/10.1093/bioinformatics/bti544}
}
@article{Liu2004Quantitative,
  author = {Liu, H. X. and Xue, C. X. and Zhang, R. S. and Yao, X. J. and Liu,
	M. C. and Hu, Z. D. and Fan, B. T.},
  title = {Quantitative prediction of logk of peptides in high-performance liquid
	chromatography based on molecular descriptors by using the heuristic
	method and support vector machine.},
  journal = {J {C}hem {I}nf {C}omput {S}ci},
  year = {2004},
  volume = {44},
  pages = {1979--1986},
  number = {6},
  abstract = {A new method support vector machine ({SVM}) and the heuristic method
	({HM}) were used to develop the nonlinear and linear models between
	the capacity factor (logk) and seven molecular descriptors of 75
	peptides for the first time. {T}he molecular descriptors representing
	the structural features of the compounds only included the constitutional
	and topological descriptors, which can be obtained easily without
	optimizing the structure of the molecule. {T}he seven molecular descriptors
	selected by the heuristic method in {CODESSA} were used as inputs
	for {SVM}. {T}he results obtained by {SVM} were compared with those
	obtained by the heuristic method. {T}he prediction result of the
	{SVM} model is better than that of heuristic method. {F}or the test
	set, a predictive correlation coefficient {R} = 0.9801 and root-mean-square
	error of 0.1523 were obtained. {T}he prediction results are in very
	good agreement with the experimental values. {B}ut the linear model
	of the heuristic method is easier to understand and ready to use
	for a chemist. {T}his paper provided a new and effective method for
	predicting the chromatography retention of peptides and some insight
	into the structural features which are related to the capacity factor
	of peptides.},
  doi = {10.1021/ci049891a},
  pdf = {../local/Liu2004Quantitative.pdf},
  file = {Liu2004Quantitative.pdf:local/Liu2004Quantitative.pdf:PDF},
  keywords = {biosvm},
  url = {http://dx.doi.org/10.1021/ci049891a}
}
@article{Liu2004Prediction,
  author = {Liu, H. X. and Zhang, R. S. and Yao, X. J. and Liu, M. C. and Hu,
	Z. D. and Fan, B. T.},
  title = {Prediction of the isoelectric point of an amino acid based on {GA}-{PLS}
	and {SVM}s.},
  journal = {J {C}hem {I}nf {C}omput {S}ci},
  year = {2004},
  volume = {44},
  pages = {161--167},
  number = {1},
  abstract = {The support vector machine ({SVM}), as a novel type of a learning
	machine, for the first time, was used to develop a {QSPR} model that
	relates the structures of 35 amino acids to their isoelectric point.
	{M}olecular descriptors calculated from the structure alone were
	used to represent molecular structures. {T}he seven descriptors selected
	using {GA}-{PLS}, which is a sophisticated hybrid approach that combines
	{GA} as a powerful optimization method with {PLS} as a robust statistical
	method for variable selection, were used as inputs of {RBFNN}s and
	{SVM} to predict the isoelectric point of an amino acid. {T}he optimal
	{QSPR} model developed was based on support vector machines, which
	showed the following results: the root-mean-square error of 0.2383
	and the prediction correlation coefficient {R}=0.9702 were obtained
	for the whole data set. {S}atisfactory results indicated that the
	{GA}-{PLS} approach is a very effective method for variable selection,
	and the support vector machine is a very promising tool for the nonlinear
	approximation.},
  doi = {10.1021/ci034173u},
  pdf = {../local/Liu2004Prediction.pdf},
  file = {Liu2004Prediction.pdf:local/Liu2004Prediction.pdf:PDF},
  keywords = {biosvm},
  url = {http://dx.doi.org/10.1021/ci034173u}
}
@article{Liu2004QSAR,
  author = {Liu, H. X. and Zhang, R. S. and Yao, X. J. and Liu, M. C. and Hu,
	Z. D. and Fan, B. T.},
  title = {{QSAR} and classification models of a novel series of {COX}-2 selective
	inhibitors: 1,5-diarylimidazoles based on support vector machines.},
  journal = {J {C}omput {A}ided {M}ol {D}es},
  year = {2004},
  volume = {18},
  pages = {389--399},
  number = {6},
  month = jun,
  abstract = {The support vector machine, which is a novel algorithm from the machine
	learning community, was used to develop quantitation and classification
	models which can be used as a potential screening mechanism for a
	novel series of {COX}-2 selective inhibitors. {E}ach compound was
	represented by calculated structural descriptors that encode constitutional,
	topological, geometrical, electrostatic, and quantum-chemical features.
	{T}he heuristic method was then used to search the descriptor space
	and select the descriptors responsible for activity. {Q}uantitative
	modelling results in a nonlinear, seven-descriptor model based on
	{SVM}s with root mean-square errors of 0.107 and 0.136 for training
	and prediction sets, respectively. {T}he best classification results
	are found using {SVM}s: the accuracy for training and test sets is
	91.2\% and 88.2\%, respectively. {T}his paper proposes a new and
	effective method for drug design and screening.},
  keywords = {biosvm, chemoinformatics}
}
@article{Liu2003QSAR,
  author = {Liu, H. X. and Zhang, R. S. and Yao, X. J. and Liu, M. C. and Hu,
	Z. D. and Fan, B. T.},
  title = {{QSAR} study of ethyl 2-[(3-methyl-2,5-dioxo(3-pyrrolinyl))amino]-4-(trifluoromethyl)
	pyrimidine-5-carboxylate: an inhibitor of {AP}-1 and {NF}-kappa {B}
	mediated gene expression based on support vector machines.},
  journal = {J {C}hem {I}nf {C}omput {S}ci},
  year = {2003},
  volume = {43},
  pages = {1288--1296},
  number = {4},
  abstract = {The support vector machine, as a novel type of learning machine, for
	the first time, was used to develop a {QSAR} model of 57 analogues
	of ethyl 2-[(3-methyl-2,5-dioxo(3-pyrrolinyl))amino]-4-(trifluoromethyl)pyrimidine-5-carboxylate
	({EPC}), an inhibitor of {AP}-1 and {NF}-kappa {B} mediated gene
	expression, based on calculated quantum chemical parameters. {T}he
	quantum chemical parameters involved in the model are {K}ier and
	{H}all index (order3) ({KHI}3), {I}nformation content (order 0) ({IC}0),
	{YZ} {S}hadow ({YZS}) and {M}ax partial charge for an {N} atom ({M}ax{PCN}),
	{M}in partial charge for an {N} atom ({M}in{PCN}). {T}he mean relative
	error of the training set, the validation set, and the testing set
	is 1.35\%, 1.52\%, and 2.23\%, respectively, and the maximum relative
	error is less than 5.00\%.},
  doi = {10.1021/ci0340355},
  pdf = {../local/Liu2003QSAR.pdf},
  file = {Liu2003QSAR.pdf:local/Liu2003QSAR.pdf:PDF},
  keywords = {biosvm},
  url = {http://dx.doi.org/10.1021/ci0340355}
}
@article{Liu2005Multiclass,
  author = {Liu, Jane Jijun and Cutler, Gene and Li, Wuxiong and Pan, Zheng and
	Peng, Sihua and Hoey, Tim and Chen, Liangbiao and Ling, Xuefeng Bruce},
  title = {Multiclass cancer classification and biomarker discovery using {GA}-based
	algorithms.},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21},
  pages = {2691--2697},
  number = {11},
  month = jun,
  abstract = {M{OTIVATION}: {T}he development of microarray-based high-throughput
	gene profiling has led to the hope that this technology could provide
	an efficient and accurate means of diagnosing and classifying tumors,
	as well as predicting prognoses and effective treatments. {H}owever,
	the large amount of data generated by microarrays requires effective
	reduction of discriminant gene features into reliable sets of tumor
	biomarkers for such multiclass tumor discrimination. {T}he availability
	of reliable sets of biomarkers, especially serum biomarkers, should
	have a major impact on our understanding and treatment of cancer.
	{RESULTS}: {W}e have combined genetic algorithm ({GA}) and all paired
	({AP}) support vector machine ({SVM}) methods for multiclass cancer
	categorization. {P}redictive features can be automatically determined
	through iterative {GA}/{SVM}, leading to very compact sets of non-redundant
	cancer-relevant genes with the best classification performance reported
	to date. {I}nterestingly, these different classifier sets harbor
	only modest overlapping gene features but have similar levels of
	accuracy in leave-one-out cross-validations ({LOOCV}). {F}urther
	characterization of these optimal tumor discriminant features, including
	the use of nearest shrunken centroids ({NSC}), analysis of annotations
	and literature text mining, reveals previously unappreciated tumor
	subclasses and a series of genes that could be used as cancer biomarkers.
	{W}ith this approach, we believe that microarray-based multiclass
	molecular analysis can be an effective tool for cancer biomarker
	discovery and subsequent molecular cancer diagnosis.},
  doi = {10.1093/bioinformatics/bti419},
  pdf = {../local/Liu2005Multiclass.pdf},
  file = {Liu2005Multiclass.pdf:local/Liu2005Multiclass.pdf:PDF},
  keywords = {biosvm},
  pii = {bti419},
  url = {http://dx.doi.org/10.1093/bioinformatics/bti419}
}
@article{Liu2004Active,
  author = {Liu, Y.},
  title = {Active learning with support vector machine applied to gene expression
	data for cancer classification},
  journal = {J. {C}hem. {I}nf. {C}omput. {S}ci.},
  year = {2004},
  volume = {44},
  pages = {1936--1941},
  number = {6},
  abstract = {There is growing interest in the application of machine learning techniques
	in bioinformatics. {T}he supervised machine learning approach has
	been widely applied to bioinformatics and gained a lot of success
	in this research area. {W}ith this learning approach researchers
	first develop a large training set, which is a time-consuming and
	costly process. {M}oreover, the proportion of the positive examples
	and negative examples in the training set may not represent the real-world
	data distribution, which causes concept drift. {A}ctive learning
	avoids these problems. {U}nlike most conventional learning methods
	where the training set used to derive the model remains static, the
	classifier can actively choose the training data and the size of
	training set increases. {W}e introduced an algorithm for performing
	active learning with support vector machine and applied the algorithm
	to gene expression profiles of colon cancer, lung cancer, and prostate
	cancer samples. {W}e compared the classification performance of active
	learning with that of passive learning. {T}he results showed that
	employing the active learning method can achieve high accuracy and
	significantly reduce the need for labeled training instances. {F}or
	lung cancer classification, to achieve 96\% of the total positives,
	only 31 labeled examples were needed in active learning whereas in
	passive learning 174 labeled examples were required. {T}hat meant
	over 82\% reduction was realized by active learning. {I}n active learning
	the areas under the receiver operating characteristic ({ROC}) curves
	were over 0.81, while in passive learning the areas under the {ROC}
	curves were below 0.50.},
  doi = {10.1021/ci049810a},
  pdf = {../local/Liu2004Active.pdf},
  file = {Liu2004Active.pdf:local/Liu2004Active.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1021/ci049810a}
}
@article{Liu2004comparative,
  author = {Y. Liu},
  title = {A comparative study on feature selection methods for drug discovery.},
  journal = {J {C}hem {I}nf {C}omput {S}ci},
  year = {2004},
  volume = {44},
  pages = {1823--1828},
  number = {5},
  abstract = {Feature selection is frequently used as a preprocessing step to machine
	learning. {T}he removal of irrelevant and redundant information often
	improves the performance of learning algorithms. {T}his paper is
	a comparative study of feature selection in drug discovery. {T}he
	focus is on aggressive dimensionality reduction. {F}ive methods were
	evaluated, including information gain, mutual information, a chi2-test,
	odds ratio, and {GSS} coefficient. {T}wo well-known classification
	algorithms, {N}aïve {B}ayesian and {S}upport {V}ector {M}achine
	({SVM}), were used to classify the chemical compounds. {T}he results
	showed that {N}aïve {B}ayesian benefited significantly from the
	feature selection, while {SVM} performed better when all features
	were used. {I}n this experiment, information gain and chi2-test were
	most effective feature selection methods. {U}sing information gain
	with a {N}aïve {B}ayesian classifier, removal of up to 96\% of the
	features yielded an improved classification accuracy measured by
	sensitivity. {W}hen information gain was used to select the features,
	{SVM} was much less sensitive to the reduction of feature space.
	{T}he feature set size was reduced by 99\%, while losing only a few
	percent in terms of sensitivity (from 58.7\% to 52.5\%) and specificity
	(from 98.4\% to 97.2\%). {I}n contrast to information gain and chi2-test,
	mutual information had relatively poor performance due to its bias
	toward favoring rare features and its sensitivity to probability
	estimation errors.},
  doi = {10.1021/ci049875d},
  pdf = {../local/Liu2004comparative.pdf},
  file = {Liu2004comparative.pdf:local/Liu2004comparative.pdf:PDF},
  keywords = {biosvm},
  url = {http://dx.doi.org/10.1021/ci049875d}
}
@article{Liu2005Gene,
  author = {Zhenqiu Liu and Dechang Chen and Halima Bensmail},
  title = {Gene expression data classification with kernel principal component
	analysis.},
  journal = {J {B}iomed {B}iotechnol},
  year = {2005},
  volume = {2005},
  pages = {155--159},
  number = {2},
  abstract = {One important feature of the gene expression data is that the number
	of genes ${M}$ far exceeds the number of samples ${N}$ . {S}tandard
	statistical methods do not work well when ${N} < {M}$ . {D}evelopment
	of new methodologies or modification of existing methodologies is
	needed for the analysis of the microarray data. {I}n this paper,
	we propose a novel analysis procedure for classifying the gene expression
	data. {T}his procedure involves dimension reduction using kernel
	principal component analysis ({KPCA}) and classification with logistic
	regression (discrimination). {KPCA} is a generalization and nonlinear
	version of principal component analysis. {T}he proposed algorithm
	was applied to five different gene expression datasets involving
	human tumor samples. {C}omparison with other popular classification
	methods such as support vector machines and neural networks shows
	that our algorithm is very promising in classifying gene expression
	data.},
  doi = {10.1155/JBB.2005.155},
  pdf = {../local/Liu2005Gene.pdf},
  file = {Liu2005Gene.pdf:local/Liu2005Gene.pdf:PDF},
  keywords = {biosvm},
  pii = {S1110724304406032_THIS_PII_IS_INCORRECT_},
  url = {http://dx.doi.org/10.1155/JBB.2005.155}
}
@article{Lo2005Effect,
  author = {Siaw Ling Lo and Cong Zhong Cai and Yu Zong Chen and Maxey C M Chung},
  title = {Effect of training datasets on support vector machine prediction
	of protein-protein interactions.},
  journal = {Proteomics},
  year = {2005},
  volume = {5},
  pages = {876--884},
  number = {4},
  month = mar,
  abstract = {Knowledge of protein-protein interaction is useful for elucidating
	protein function via the concept of 'guilt-by-association'. {A} statistical
	learning method, {S}upport {V}ector {M}achine ({SVM}), has recently
	been explored for the prediction of protein-protein interactions
	using artificial shuffled sequences as hypothetical noninteracting
	proteins and it has shown promising results ({B}ock, {J}. {R}., {G}ough,
	{D}. {A}., {B}ioinformatics 2001, 17, 455-460). {I}t remains unclear
	however, how the prediction accuracy is affected if real protein
	sequences are used to represent noninteracting proteins. {I}n this
	work, this effect is assessed by comparison of the results derived
	from the use of real protein sequences with that derived from the
	use of shuffled sequences. {T}he real protein sequences of hypothetical
	noninteracting proteins are generated from an exclusion analysis
	in combination with subcellular localization information of interacting
	proteins found in the {D}atabase of {I}nteracting {P}roteins. {P}rediction
	accuracy using real protein sequences is 76.9\% compared to 94.1\%
	using artificial shuffled sequences. {T}he discrepancy likely arises
	from the expected higher level of difficulty for separating two sets
	of real protein sequences than that for separating a set of real
	protein sequences from a set of artificial sequences. {T}he use of
	real protein sequences for training a {SVM} classification system
	is expected to give better prediction results in practical cases.
	{T}his is tested by using both {SVM} systems for predicting putative
	protein partners of a set of thioredoxin related proteins. {T}he
	prediction results are consistent with observations, suggesting that
	real sequence is more practically useful in development of {SVM}
	classification system for facilitating protein-protein interaction
	prediction.},
  doi = {10.1002/pmic.200401118},
  pdf = {../local/Lo2005Effect.pdf},
  file = {Lo2005Effect.pdf:local/Lo2005Effect.pdf:PDF},
  keywords = {biosvm},
  url = {http://dx.doi.org/10.1002/pmic.200401118}
}
@article{Lodhi2002Text,
  author = {Lodhi, H. and Saunders, C. and Shawe-Taylor, J. and Cristianini,
	N. and Watkins, C.},
  title = {Text classification using string kernels},
  journal = {J. {M}ach. {L}earn. {R}es.},
  year = {2002},
  volume = {2},
  pages = {419--444},
  pdf = {../local/lodh02.pdf},
  file = {lodh02.pdf:local/lodh02.pdf:PDF},
  keywords = {biosvm},
  subject = {kernel},
  url = {http://www.ai.mit.edu/projects/jmlr/papers/volume2/lodhi02a/abstract.html}
}
@inproceedings{Lodhi2000Text,
  author = {Lodhi, H. and Shawe-Taylor, J. and Cristianini, N. and Watkins, C.
	J. C. H.},
  title = {Text {C}lassification using {S}tring {K}ernels},
  booktitle = {Adv. {N}eural {I}nform. {P}rocess. {S}yst.},
  year = {2000},
  pages = {563--569},
  pdf = {../local/lodh00.pdf},
  file = {lodh00.pdf:local/lodh00.pdf:PDF},
  keywords = {biosvm},
  subject = {kernel},
  url = {http://www.neurocolt.com/tech_reps/2000/00079.ps.gz}
}
@techreport{Logan2001Study,
  author = {Logan, B. and Moreno, P. and Suzek, B. and Weng, Z. and Kasif, S.},
  title = {A {S}tudy of {R}emote {H}omology {D}etection},
  institution = {Compaq Cambridge Research laboratory},
  year = {2001},
  number = {CRL 2001/05},
  month = jun,
  abstract = {Functional annotation of newly sequenced genomes is an important challenge
	for computational biology systems. {W}hile much progress has been
	made towards scaling up experimental methods for functional assignment
	to putative genes, most current genomic annotation systems rely on
	computational solutions for homology modeling via sequence or structural
	similarity. {W}e present a new method for remote homology detection
	that relies on combining probabilistic modeling and supervised learning
	in high-dimensional features spaces. {O}ur system uses a transformation
	that converts protein domains to fixed-dimension representative feature
	vectors, where each feature records the sensitivity of each protein
	domain to a previously learned set of ``protein motifs'' or ``blocks''.
	{S}ubsequently, the system utilizes {S}upport {V}ector {M}achine
	({SVM}) classifiers to learn the boundaries between structural protein
	classes. {O}ur experiments suggest that this technique performs well
	relative to several other remote homology methods for the majority
	of protein domains in {SCOP} 1.37 {PDB}90.},
  pdf = {../local/Logan2001Study.pdf},
  file = {Logan2001Study.pdf:local/Logan2001Study.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert}
}
@article{Lu2003Expression,
  author = {Lu, Y. J. and Williamson, D. and Wang, R. and Summersgill, B. and
	Rodriguez, S. and Rogers, S. and Pritchard-Jones, K. and Campbell,
	C. and Shipley, J.},
  title = {Expression profiling targeting chromosomes for tumor classification
	and prediction of clinical behavior.},
  journal = {Genes {C}hromosomes {C}ancer},
  year = {2003},
  volume = {38},
  pages = {207--214},
  number = {3},
  abstract = {Tumors are associated with altered or deregulated gene products that
	affect critical cellular functions. {H}ere we assess the use of a
	global expression profiling technique that identifies chromosome
	regions corresponding to differential gene expression, termed comparative
	expressed sequence hybridization ({CESH}). {CESH} analysis was performed
	on a total of 104 tumors with a diagnosis of rhabdomyosarcoma, leiomyosarcoma,
	prostate cancer, and favorable-histology {W}ilms tumors. {T}hrough
	the use of the chromosome regions identified as variables, support
	vector machine analysis was applied to assess classification potential,
	and feature selection (recursive feature elimination) was used to
	identify the best discriminatory regions. {W}e demonstrate that the
	{CESH} profiles have characteristic patterns in tumor groups and
	were also able to distinguish subgroups of rhabdomyosarcoma. {T}he
	overall {CESH} profiles in favorable-histology {W}ilms tumors were
	found to correlate with subsequent clinical behavior. {C}lassification
	by use of {CESH} profiles was shown to be similar in performance
	to previous microarray expression studies and highlighted regions
	for further investigation. {W}e conclude that analysis of chromosomal
	expression profiles can group, subgroup, and even predict clinical
	behavior of tumors to a level of performance similar to that of microarray
	analysis. {CESH} is independent of selecting sequences for interrogation
	and is a simple, rapid, and widely accessible approach to identify
	clinically useful differential expression.},
  doi = {10.1002/gcc.10276},
  pdf = {../local/Lu2003Expression.pdf},
  file = {Lu2003Expression.pdf:local/Lu2003Expression.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert}
}
@article{Luan2005Classification,
  author = {Feng Luan and Ruisheng Zhang and Chunyan Zhao and Xiaojun Yao and
	Mancang Liu and Zhide Hu and Botao Fan},
  title = {Classification of the carcinogenicity of {N}-nitroso compounds based
	on support vector machines and linear discriminant analysis.},
  journal = {Chem {R}es {T}oxicol},
  year = {2005},
  volume = {18},
  pages = {198--203},
  number = {2},
  month = feb,
  abstract = {The support vector machine ({SVM}), as a novel type of learning machine,
	was used to develop a classification model of carcinogenic properties
	of 148 {N}-nitroso compounds. {T}he seven descriptors calculated
	solely from the molecular structures of compounds selected by forward
	stepwise linear discriminant analysis ({LDA}) were used as inputs
	of the {SVM} model. {T}he obtained results confirmed the discriminative
	capacity of the calculated descriptors. {T}he result of {SVM} (total
	accuracy of 95.2\%) is better than that of {LDA} (total accuracy
	of 89.8\%).},
  doi = {10.1021/tx049782q},
  pdf = {../local/Luan2005Classification.pdf},
  file = {Luan2005Classification.pdf:local/Luan2005Classification.pdf:PDF},
  keywords = {biosvm},
  url = {http://dx.doi.org/10.1021/tx049782q}
}
@article{Ma2006MSB,
  author = {Wenzhe Ma and Luhua Lai and Qi Ouyang and Chao Tang},
  title = {Robustness and modular design of the {Drosophila} segment polarity
	network.},
  journal = {Mol Syst Biol},
  year = {2006},
  volume = {2},
  pages = {70},
  abstract = {Biomolecular networks have to perform their functions robustly. A
	robust function may have preferences in the topological structures
	of the underlying network. We carried out an exhaustive computational
	analysis on network topologies in relation to a patterning function
	in Drosophila embryogenesis. We found that whereas the vast majority
	of topologies can either not perform the required function or only
	do so very fragilely, a small fraction of topologies emerges as particularly
	robust for the function. The topology adopted by Drosophila, that
	of the segment polarity network, is a top ranking one among all topologies
	with no direct autoregulation. Furthermore, we found that all robust
	topologies are modular-each being a combination of three kinds of
	modules. These modules can be traced back to three subfunctions of
	the patterning function, and their combinations provide a combinatorial
	variability for the robust topologies. Our results suggest that the
	requirement of functional robustness drastically reduces the choices
	of viable topology to a limited set of modular combinations among
	which nature optimizes its choice under evolutionary and other biological
	constraints.},
  doi = {10.1038/msb4100111},
  institution = {Center for Theoretical Biology, Peking University, Beijing, China.},
  keywords = {Animals; Biological Evolution; Body Patterning; Computer Simulation;
	Drosophila Proteins, physiology; Drosophila melanogaster, anatomy
	/&/ histology/physiology; Feedback, Physiological; Gene Expression
	Regulation, Developmental; Genes, Insect; Models, Biological; Signal
	Transduction; Systems Biology, methods; Transcription Factors},
  language = {eng},
  medline-pst = {ppublish},
  owner = {Andrei Zinovyev},
  pii = {msb4100111},
  pmid = {17170765},
  timestamp = {2011.04.08},
  url = {http://dx.doi.org/10.1038/msb4100111}
}
@article{Madeira2004Biclustering,
  author      = {Madeira, S. C. and Oliveira, A. L.},
  title       = {Biclustering algorithms for biological data analysis: a survey.},
  journal     = {IEEE/ACM Trans Comput Biol Bioinform},
  year        = {2004},
  volume      = {1},
  number      = {1},
  pages       = {24--45},
  doi         = {10.1109/TCBB.2004.2},
  url         = {http://dx.doi.org/10.1109/TCBB.2004.2},
  pmid        = {17048406},
  abstract    = {A large number of clustering approaches have been proposed
    for the analysis of gene expression data obtained from microarray
    experiments. However, the results from the application of standard
    clustering methods to genes are limited. This limitation is imposed by
    the existence of a number of experimental conditions where the activity
    of genes is uncorrelated. A similar limitation exists when clustering of
    conditions is performed. For this reason, a number of algorithms that
    perform simultaneous clustering on the row and column dimensions of the
    data matrix has been proposed. The goal is to find submatrices, that is,
    subgroups of genes and subgroups of conditions, where the genes exhibit
    highly correlated activities for every condition. In this paper, we
    refer to this class of algorithms as biclustering. Biclustering is also
    referred in the literature as coclustering and direct clustering, among
    others names, and has also been used in fields such as information
    retrieval and data mining. In this comprehensive survey, we analyze a
    large number of existing approaches to biclustering, and classify them
    in accordance with the type of biclusters they can find, the patterns of
    biclusters that are discovered, the methods used to perform the search,
    the approaches used to evaluate the solution, and the target
    applications.},
  institution = {University of Beira Interior, Rua Marquês D'Avila e Bolama,
    Covilhã, Portugal. smadeira@di.ubi.pt},
  keywords    = {Algorithms; Cluster Analysis; Computational Biology, methods;
    Gene Expression Profiling, statistics /&/ numerical data; Gene
    Expression, genetics; Humans; Models, Statistical; Oligonucleotide Array
    Sequence Analysis, methods; Saccharomyces cerevisiae, genetics},
  language    = {eng},
  medline-pst = {ppublish},
  owner       = {jp},
  timestamp   = {2012.02.27}
}
@inproceedings{Mahe2004Extensions,
  author = {Mah{\'e}, P. and Ueda, N. and Akutsu, T. and Perret, J.-L. and Vert,
	J.-P.},
  title = {Extensions of marginalized graph kernels},
  booktitle = {Proceedings of the {T}wenty-{F}irst {I}nternational {C}onference
	on {M}achine {L}earning ({ICML} 2004)},
  year = {2004},
  editor = {Greiner, R. and Schuurmans, D.},
  pages = {552--559},
  publisher = {ACM Press},
  abstract = {Positive definite kernels between labeled graphs have recently been
	proposed. {T}hey enable the application of kernel methods, such as
	support vector machines, to the analysis and classification of graphs,
	for example, chemical compounds. {T}hese graph kernels are obtained
	by marginalizing a kernel between paths with respect to a random walk
	model on the graph vertices along the edges. {W}e propose two extensions
	of these graph kernels, with the double goal to reduce their computation
	time and increase their relevance as measure of similarity between
	graphs. {F}irst, we propose to modify the label of each vertex by
	automatically adding information about its environment with the use of
	the {M}organ algorithm. {S}econd, we suggest a modification of the
	random walk model to prevent the walk from coming back to a vertex
	that was just visited. {T}hese extensions are then tested on benchmark
	experiments of chemical compounds classification, with promising results.},
  pdf = {../local/icmlMod.pdf},
  file = {icmlMod.pdf:http\://cg.ensmp.fr/~vert/publi/04icml/icmlMod.pdf:PDF;icmlMod.pdf:http\://cg.ensmp.fr/~vert/publi/04icml/icmlMod.pdf:PDF},
  keywords = {biosvm chemoinformatics},
  owner = {vert}
}
@article{Mahe2005Graph,
  author = {Mah{\'e}, P. and Ueda, N. and Akutsu, T. and Perret, J.-L. and Vert,
	J.-P.},
  title = {Graph kernels for molecular structure-activity relationship analysis
	with support vector machines},
  journal = {J. Chem. Inf. Model.},
  year = {2005},
  volume = {45},
  pages = {939--951},
  number = {4},
  abstract = {The support vector machine algorithm together with graph kernel functions
	has recently been introduced to model structure-activity relationships
	({SAR}) of molecules from their 2{D} structure, without the need
	for explicit molecular descriptor computation. {W}e propose two extensions
	to this approach with the double goal to reduce the computational
	burden associated with the model and to enhance its predictive accuracy:
	description of the molecules by a {M}organ index process and definition
	of a second-order {M}arkov model for random walks on 2{D} structures.
	{E}xperiments on two mutagenicity data sets validate the proposed
	extensions, making this approach a possible complementary alternative
	to other modeling strategies.},
  doi = {10.1021/ci050039t},
  pdf = {../local/Mahe2005Graph.pdf},
  file = {Mahe2005Graph.pdf:local/Mahe2005Graph.pdf:PDF},
  keywords = {biosvm chemoinformatics},
  url = {http://dx.doi.org/10.1021/ci050039t}
}
@article{Majoros2005Efficient,
  author = {Majoros, W. H. and Pertea, L. and Salzberg, S. L.},
  title = {Efficient implementation of a generalized pair hidden {M}arkov model
	for comparative gene finding.},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21},
  pages = {1782--1788},
  number = {9},
  month = may,
  abstract = {M{OTIVATION}: {T}he increased availability of genome sequences of
	closely related organisms has generated much interest in utilizing
	homology to improve the accuracy of gene prediction programs. {G}eneralized
	pair hidden {M}arkov models ({GPHMM}s) have been proposed as one
	means to address this need. {H}owever, all {GPHMM} implementations
	currently available are either closed-source or the details of their
	operation are not fully described in the literature, leaving a significant
	hurdle for others wishing to advance the state of the art in {GPHMM}
	design. {RESULTS}: {W}e have developed an open-source {GPHMM} gene
	finder, {TWAIN}, which performs very well on two related {A}spergillus
	species, {A}.fumigatus and {A}.nidulans, finding 89\% of the exons
	and predicting 74\% of the gene models exactly correctly in a test
	set of 147 conserved gene pairs. {W}e describe the implementation
	of this {GPHMM} and we explicitly address the assumptions and limitations
	of the system. {W}e suggest possible ways of relaxing those assumptions
	to improve the utility of the system without sacrificing efficiency
	beyond what is practical. {AVAILABILITY}: {A}vailable at http://www.tigr.org/software/pirate/twain/twain.html
	under the open-source {A}rtistic {L}icense.},
  doi = {10.1093/bioinformatics/bti297},
  pdf = {../local/Majoros2005Efficient.pdf},
  file = {Majoros2005Efficient.pdf:local/Majoros2005Efficient.pdf:PDF},
  keywords = {biogm},
  owner = {vert},
  pii = {bti297},
  pmid = {15691859},
  timestamp = {2006.01.18},
  url = {http://dx.doi.org/10.1093/bioinformatics/bti297}
}
@article{Man2004Evaluating,
  author = {Man, M. Z. and Dyson, G. and Johnson, K. and Liao, B.},
  title = {Evaluating methods for classifying expression data.},
  journal = {J. {B}iopharm. {S}tat.},
  year = {2004},
  volume = {14},
  pages = {1065--1084},
  number = {4},
  abstract = {An attractive application of expression technologies is to predict
	drug efficacy or safety using expression data of biomarkers. {T}o
	evaluate the performance of various classification methods for building
	predictive models, we applied these methods on six expression datasets.
	{T}hese datasets were from studies using microarray technologies
	and had either two or more classes. {F}rom each of the original datasets,
	two subsets were generated to simulate two scenarios in biomarker
	applications. {F}irst, a 50-gene subset was used to simulate a candidate
	gene approach when it might not be practical to measure a large number
	of genes/biomarkers. {N}ext, a 2000-gene subset was used to simulate
	a whole genome approach. {W}e evaluated the relative performance
	of several classification methods by using leave-one-out cross-validation
	and bootstrap cross-validation. {A}lthough all methods perform well
	in both subsets for a relative easy dataset with two classes, differences
	in performance do exist among methods for other datasets. {O}verall,
	partial least squares discriminant analysis ({PLS}-{DA}) and support
	vector machines ({SVM}) outperform all other methods. {W}e suggest
	a practical approach to take advantage of multiple methods in biomarker
	applications.},
  doi = {10.1081/BIP-200035491},
  pdf = {../local/Man2004Evaluating.pdf},
  file = {Man2004Evaluating.pdf:local/Man2004Evaluating.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert}
}
@article{Mao2005Multiclass,
  author = {Yong Mao and Xiaobo Zhou and Daoying Pi and Youxian Sun and Stephen
	T C Wong},
  title = {Multiclass cancer classification by using fuzzy support vector machine
	and binary decision tree with gene selection.},
  journal = {J {B}iomed {B}iotechnol},
  year = {2005},
  volume = {2005},
  pages = {160--171},
  number = {2},
  abstract = {We investigate the problems of multiclass cancer classification with
	gene selection from gene expression data. {T}wo different constructed
	multiclass classifiers with gene selection are proposed, which are
	fuzzy support vector machine ({FSVM}) with gene selection and binary
	classification tree based on {SVM} with gene selection. {U}sing {F}
	test and recursive feature elimination based on {SVM} as gene selection
	methods, binary classification tree based on {SVM} with {F} test,
	binary classification tree based on {SVM} with recursive feature
	elimination based on {SVM}, and {FSVM} with recursive feature elimination
	based on {SVM} are tested in our experiments. {T}o accelerate computation,
	preselecting the strongest genes is also used. {T}he proposed techniques
	are applied to analyze breast cancer data, small round blue-cell
	tumors, and acute leukemia data. {C}ompared to existing multiclass
	cancer classifiers and binary classification tree based on {SVM}
	with {F} test or binary classification tree based on {SVM} with recursive
	feature elimination based on {SVM} mentioned in this paper, {FSVM}
	based on recursive feature elimination based on {SVM} can find most
	important genes that affect certain types of cancer with high recognition
	accuracy.},
  doi = {10.1155/JBB.2005.160},
  pdf = {../local/Mao2005Multiclass.pdf},
  file = {Mao2005Multiclass.pdf:local/Mao2005Multiclass.pdf:PDF},
  keywords = {biosvm},
  pii = {S1110724304406044_THIS_PII_IS_INCORRECT_},
  url = {http://dx.doi.org/10.1155/JBB.2005.160}
}
@article{Marbach2009Replaying,
  author = {Marbach, D. and Mattiussi, C. and Floreano, D.},
  title = {Replaying the evolutionary tape: biomimetic reverse engineering of
	gene networks.},
  journal = {Ann N Y Acad Sci},
  year = {2009},
  volume = {1158},
  pages = {234--245},
  month = mar,
  abstract = {In this paper, we suggest a new approach for reverse engineering gene
	regulatory networks, which consists of using a reconstruction process
	that is similar to the evolutionary process that created these networks.
	The aim is to integrate prior knowledge into the reverse-engineering
	procedure, thus biasing the search toward biologically plausible
	solutions. To this end, we propose an evolutionary method that abstracts
	and mimics the natural evolution of gene regulatory networks. Our
	method can be used with a wide range of nonlinear dynamical models.
	This allows us to explore novel model types such as the log-sigmoid
	model introduced here. We apply the biomimetic method to a gold-standard
	dataset from an in vivo gene network. The obtained results won a
	reverse engineering competition of the second DREAM conference (Dialogue
	on Reverse Engineering Assessments and Methods 2007, New York, NY).},
  doi = {10.1111/j.1749-6632.2008.03944.x},
  institution = {Laboratory of Intelligent Systems, Ecole Polytechnique Fédérale de
	Lausanne, Lausanne, Switzerland.},
  keywords = {Algorithms; Biomimetics; Computational Biology; Databases, Genetic;
	Evolution; Gene Regulatory Networks; Models, Biological; Nonlinear
	Dynamics},
  owner = {fantine},
  pii = {NYAS03944},
  pmid = {19348645},
  timestamp = {2010.10.19},
  url = {http://dx.doi.org/10.1111/j.1749-6632.2008.03944.x}
}
@article{Marcotte1999Detecting,
  author = {Marcotte, E. M. and Pellegrini, M. and Ng, H.-L. and Rice, D. W.
	and Yeates, T. O. and Eisenberg, D.},
  title = {Detecting {P}rotein {F}unction and {P}rotein-{P}rotein {I}nteractions
	from {G}enome {S}equences},
  journal = {Science},
  year = {1999},
  volume = {285},
  pages = {751--753},
  pdf = {../local/marc99b.pdf},
  file = {marc99b.pdf:local/marc99b.pdf:PDF},
  subject = {bio},
  url = {http://www.sciencemag.org/cgi/reprint/285/5428/751.pdf}
}
@article{Marcotte1999combined,
  author = {Marcotte, E. M. and Pellegrini, M. and Thompson, M. J. and Yeates,
	T. O. and Eisenberg, D.},
  title = {A combined algorithm for genome-wide prediction of protein function},
  journal = {Nature},
  year = {1999},
  volume = {402},
  pages = {83--86},
  month = nov,
  pdf = {../local/marc99.pdf},
  file = {marc99.pdf:local/marc99.pdf:PDF},
  subject = {bio},
  url = {http://www.nature.com/cgi-taf/DynaPage.taf?file=/nature/journal/v402/n6757/full/402083a0_fs.html&content_filetype=PDF}
}
@article{Markowetz2010How,
  author      = {Markowetz, Florian},
  title       = {How to understand the cell by breaking it: network analysis
    of gene perturbation screens.},
  journal     = {PLoS Comput Biol},
  year        = {2010},
  volume      = {6},
  number      = {2},
  pages       = {e1000655},
  doi         = {10.1371/journal.pcbi.1000655},
  url         = {http://dx.doi.org/10.1371/journal.pcbi.1000655},
  pmid        = {20195495},
  institution = {Cancer Research UK Cambridge Research Institute, Cambridge,
    United Kingdom.},
  keywords    = {Animals; Cell Physiological Processes; Cluster Analysis; Gene
    Regulatory Networks; Genomics; Humans; Models, Genetic; Models,
    Statistical; Phenotype; Signal Transduction; Systems Biology},
  owner       = {phupe},
  timestamp   = {2010.08.30}
}
@article{Markowetz2003Support,
  author = {F. Markowetz and L. Edler and M. Vingron},
  title = {Support {V}ector {M}achines for {P}rotein {F}old {C}lass {P}rediction},
  journal = {Biometrical {J}ournal},
  year = {2003},
  volume = {45},
  pages = {377--389},
  number = {3},
  abstract = {Knowledge of the three-dimensional structure of a protein is essential
	for describing and understanding its function. {T}oday, a large number
	of known protein sequences faces a small number of identified structures.
	{T}hus, the need arises to predict structure from sequence without
	using time-consuming experimental identification. {I}n this paper
	the performance of {S}upport {V}ector {M}achines ({SVM}s) is compared
	to {N}eural {N}etworks and to standard statistical classification
	methods as {D}iscriminant {A}nalysis and {N}earest {N}eighbor {C}lassification.
	{W}e show that {SVM}s can beat the competing methods on a dataset
	of 268 protein sequences to be classified into a set of 42 fold classes.
	{W}e discuss misclassification with respect to biological function
	and similarity. {I}n a second step we examine the performance of
	{SVM}s if the embedding is varied from frequencies of single amino
	acids to frequencies of triplets of amino acids. {T}his work shows
	that {SVM} provide a promising alternative to standard statistical
	classification and prediction methods in functional genomics.},
  doi = {10.1002/bimj.200390019},
  pdf = {../local/Markowetz2003Support.pdf},
  file = {Markowetz2003Support.pdf:local/Markowetz2003Support.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://www3.interscience.wiley.com/cgi-bin/abstract/104525729/START}
}
@article{Marsland2002self-organising,
  author = {Stephen Marsland and Jonathan Shapiro and Ulrich Nehmzow},
  title = {A self-organising network that grows when required.},
  journal = {Neural {N}etw},
  year = {2002},
  volume = {15},
  pages = {1041--1058},
  number = {8--9},
  abstract = {The ability to grow extra nodes is a potentially useful facility for
	a self-organising neural network. {A} network that can add nodes
	into its map space can approximate the input space more accurately,
	and often more parsimoniously, than a network with predefined structure
	and size, such as the {S}elf-{O}rganising {M}ap. {I}n addition, a
	growing network can deal with dynamic input distributions. {M}ost
	of the growing networks that have been proposed in the literature
	add new nodes to support the node that has accumulated the highest
	error during previous iterations or to support topological structures.
	{T}his usually means that new nodes are added only when the number
	of iterations is an integer multiple of some pre-defined constant,
	{A}. {T}his paper suggests a way in which the learning algorithm
	can add nodes whenever the network in its current state does not
	sufficiently match the input. {I}n this way the network grows very
	quickly when new data is presented, but stops growing once the network
	has matched the data. {T}his is particularly important when we consider
	dynamic data sets, where the distribution of inputs can change to
	a new regime after some time. {W}e also demonstrate the preservation
	of neighbourhood relations in the data by the network. {T}he new
	network is compared to an existing growing network, the {G}rowing
	{N}eural {G}as ({GNG}), on a artificial dataset, showing how the
	network deals with a change in input distribution after some time.
	{F}inally, the new network is applied to several novelty detection
	tasks and is compared with both the {GNG} and an unsupervised form
	of the {R}educed {C}oulomb {E}nergy network on a robotic inspection
	task and with a {S}upport {V}ector {M}achine on two benchmark novelty
	detection tasks.},
  keywords = {Acute, Algorithms, Animals, Anion Exchange Resins, Artificial Intelligence,
	Automated, Base Pair Mismatch, Base Pairing, Base Sequence, Biological,
	Biosensing Techniques, Carcinoma, Chemical, Chromatography, Citric
	Acid Cycle, Classification, Cluster Analysis, Comparative Study,
	Computational Biology, Computer-Assisted, Cystadenoma, DNA, Databases,
	Decision Making, Diagnosis, Differential, Drug, Drug Design, Electrostatics,
	Eukaryotic Cells, Factual, Feasibility Studies, Female, Gene Expression,
	Gene Expression Profiling, Gene Expression Regulation, Genes, Genetic,
	Genetic Heterogeneity, Genetic Markers, Hemolysins, Humans, Internet,
	Ion Exchange, Leukemia, Ligands, Likelihood Functions, Logistic Models,
	Lung Neoplasms, Lymphocytic, Lymphoma, Markov Chains, Mathematics,
	Messenger, Models, Molecular, Molecular Probe Techniques, Molecular
	Sequence Data, Nanotechnology, Neoplasm, Neoplasms, Neoplastic, Neural
	Networks (Computer), Non-P.H.S., Non-Small-Cell Lung, Non-U.S. Gov't,
	Nucleic Acid Conformation, Nucleic Acid Hybridization, Observer Variation,
	Oligonucleotide Array Sequence Analysis, Ovarian Neoplasms, P.H.S.,
	Pattern Recognition, Probability, Probability Learning, Protein Binding,
	Protein Conformation, Proteins, Quality Control, Quantum Theory,
	RNA, RNA Splicing, Receptors, Reference Values, Regression Analysis,
	Reproducibility of Results, Research Support, Robotics, Saccharomyces
	cerevisiae Proteins, Sensitivity and Specificity, Sequence Analysis,
	Signal Processing, Software, Statistical, Stomach Neoplasms, Structural,
	Structure-Activity Relationship, Thermodynamics, Transcription, Tumor
	Markers, U.S. Gov't, 12416693}
}
@article{Martin2005Predicting,
  author = {Martin, S. and Roe, D. and Faulon, J.-L.},
  title = {Predicting protein-protein interactions using signature products},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21},
  pages = {218--226},
  number = {2},
  month = jan,
  abstract = {Motivation: {P}roteome-wide prediction of protein-protein interaction
	is a difficult and important problem in biology. {A}lthough there
	have been recent advances in both experimental and computational
	methods for predicting protein-protein interactions, we are only
	beginning to see a confluence of these techniques. {I}n this paper,
	we describe a very general, high-throughput method for predicting
	protein-protein interactions. {O}ur method combines a sequence-based
	description of proteins with experimental information that can be
	gathered from any type of protein-protein interaction screen. {T}he
	method uses a novel description of interacting proteins by extending
	the signature descriptor, which has demonstrated success in predicting
	peptide/protein binding interactions for individual proteins. {T}his
	descriptor is extended to protein pairs by taking signature products.
	{T}he signature product is implemented within a support vector machine
	classifier as a kernel function. {R}esults: {W}e have applied our
	method to publicly available yeast, {H}elicobacter pylori, human
	and mouse datasets. {W}e used the yeast and {H}.pylori datasets to
	verify the predictive ability of our method, achieving from 70 to
	80\% accuracy rates using 10-fold cross-validation. {W}e used the
	human and mouse datasets to demonstrate that our method is capable
	of cross-species prediction. {F}inally, we reused the yeast dataset
	to explore the ability of our algorithm to predict domains. {C}ontact:
	smartin@sandia.gov.},
  doi = {10.1093/bioinformatics/bth483},
  pdf = {../local/Martin2005Predicting.pdf},
  file = {Martin2005Predicting.pdf:local/Martin2005Predicting.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/21/2/218}
}
@article{Martoglio2002decomposition,
  author = {Ann-Marie Martoglio and James W Miskin and Stephen K Smith and David
	J C MacKay},
  title = {A decomposition model to track gene expression signatures: preview
	on observer-independent classification of ovarian cancer},
  journal = {Bioinformatics},
  year = {2002},
  volume = {18},
  pages = {1617--1624},
  number = {12},
  month = dec,
  abstract = {M{OTIVATION}: {A} number of algorithms and analytical models have
	been employed to reduce the multidimensional complexity of {DNA}
	array data and attempt to extract some meaningful interpretation
	of the results. {T}hese include clustering, principal components
	analysis, self-organizing maps, and support vector machine analysis.
	{E}ach method assumes an implicit model for the data, many of which
	separate genes into distinct clusters defined by similar expression
	profiles in the samples tested. {A} point of concern is that many
	genes may be involved in a number of distinct behaviours, and should
	therefore be modelled to fit into as many separate clusters as detected
	in the multidimensional gene expression space. {T}he analysis of
	gene expression data using a decomposition model that is independent
	of the observer involved would be highly beneficial to improve standard
	and reproducible classification of clinical and research samples.
	{RESULTS}: {W}e present a variational independent component analysis
	({ICA}) method for reducing high dimensional {DNA} array data to
	a smaller set of latent variables, each associated with a gene signature.
	{W}e present the results of applying the method to data from an ovarian
	cancer study, revealing a number of tissue type-specific and tissue
	type-independent gene signatures present in varying amounts among
	the samples surveyed. {T}he observer independent results of such
	molecular analysis of biological samples could help identify patients
	who would benefit from different treatment strategies. {W}e further
	explore the application of the model to similar high-throughput studies.},
  keywords = {Acute, Algorithms, Automated, Base Pair Mismatch, Base Pairing, Base
	Sequence, Biological, Biosensing Techniques, Cluster Analysis, Comparative
	Study, Computer-Assisted, Cystadenoma, DNA, Female, Gene Expression,
	Gene Expression Profiling, Gene Expression Regulation, Genes, Genetic,
	Genetic Markers, Hemolysins, Humans, Leukemia, Lymphocytic, Markov
	Chains, Messenger, Models, Molecular Probe Techniques, Molecular
	Sequence Data, Nanotechnology, Neoplasm, Neoplastic, Neural Networks
	(Computer), Non-U.S. Gov't, Nucleic Acid Conformation, Observer Variation,
	Oligonucleotide Array Sequence Analysis, Ovarian Neoplasms, Pattern
	Recognition, Quality Control, RNA, Reference Values, Reproducibility
	of Results, Research Support, Sensitivity and Specificity, Signal
	Processing, Statistical, Stomach Neoplasms, Transcription, Tumor
	Markers},
  pmid = {12490446}
}
@article{Maslov2002Specificity,
  author = {Maslov, S. and Sneppen, K.},
  title = {Specificity and stability in topology of protein networks},
  journal = {Science},
  year = {2002},
  volume = {296},
  pages = {910--913},
  number = {5569},
  pdf = {../local/masl02.pdf},
  file = {masl02.pdf:local/masl02.pdf:PDF},
  subject = {bionet},
  url = {http://www.sciencemag.org/cgi/reprint/296/5569/910.pdf}
}
@article{Mateos2002Systematic,
  author = {Alvaro Mateos and Joaqu{\'\i}n Dopazo and Ronald Jansen and Yuhai Tu
	and Mark Gerstein and Gustavo Stolovitzky},
  title = {Systematic learning of gene functional classes from {DNA} array expression
	data by using multilayer perceptrons},
  journal = {Genome Res.},
  year = {2002},
  volume = {12},
  pages = {1703--1715},
  number = {11},
  month = nov,
  abstract = {Recent advances in microarray technology have opened new ways for
	functional annotation of previously uncharacterised genes on a genomic
	scale. {T}his has been demonstrated by unsupervised clustering of
	co-expressed genes and, more importantly, by supervised learning
	algorithms. {U}sing prior knowledge, these algorithms can assign
	functional annotations based on more complex expression signatures
	found in existing functional classes. {P}reviously, support vector
	machines ({SVM}s) and other machine-learning methods have been applied
	to a limited number of functional classes for this purpose. {H}ere
	we present, for the first time, the comprehensive application of
	supervised neural networks ({SNN}s) for functional annotation. {O}ur
	study is novel in that we report systematic results for $\sim$100 classes
	in the {M}unich {I}nformation {C}enter for {P}rotein {S}equences
	({MIPS}) functional catalog. {W}e found that only $\sim$10\% of these
	are learnable (based on the rate of false negatives). {A} closer
	analysis reveals that false positives (and negatives) in a machine-learning
	context are not necessarily "false" in a biological sense. {W}e show
	that the high degree of interconnections among functional classes
	confounds the signatures that ought to be learned for a unique class.
	{W}e term this the "{B}orges effect" and introduce two new numerical
	indices for its quantification. {O}ur analysis indicates that classification
	systems with a lower {B}orges effect are better suitable for machine
	learning. {F}urthermore, we introduce a learning procedure for combining
	false positives with the original class. {W}e show that in a few
	iterations this process converges to a gene set that is learnable
	with considerably low rates of false positives and negatives and
	contains genes that are biologically related to the original class,
	allowing for a coarse reconstruction of the interactions between
	associated biological pathways. {W}e exemplify this methodology using
	the well-studied tricarboxylic acid cycle.},
  doi = {10.1101/gr.192502},
  pdf = {../local/Mateos2002Systematic.pdf},
  file = {Mateos2002Systematic.pdf:local/Mateos2002Systematic.pdf:PDF},
  keywords = {Acute, Algorithms, Animals, Anion Exchange Resins, Artificial Intelligence,
	Automated, Base Pair Mismatch, Base Pairing, Base Sequence, Biological,
	Biosensing Techniques, Carcinoma, Chemical, Chromatography, Citric
	Acid Cycle, Classification, Cluster Analysis, Comparative Study,
	Computational Biology, Computer-Assisted, Cystadenoma, DNA, Databases,
	Decision Making, Diagnosis, Differential, Drug, Drug Design, Electrostatics,
	Eukaryotic Cells, Factual, Feasibility Studies, Female, Gene Expression,
	Gene Expression Profiling, Gene Expression Regulation, Genes, Genetic,
	Genetic Heterogeneity, Genetic Markers, Hemolysins, Humans, Internet,
	Ion Exchange, Leukemia, Ligands, Likelihood Functions, Logistic Models,
	Lung Neoplasms, Lymphocytic, Lymphoma, Markov Chains, Mathematics,
	Messenger, Models, Molecular, Molecular Probe Techniques, Molecular
	Sequence Data, Nanotechnology, Neoplasm, Neoplasms, Neoplastic, Neural
	Networks (Computer), Non-P.H.S., Non-Small-Cell Lung, Non-U.S. Gov't,
	Nucleic Acid Conformation, Nucleic Acid Hybridization, Observer Variation,
	Oligonucleotide Array Sequence Analysis, Ovarian Neoplasms, P.H.S.,
	Pattern Recognition, Probability, Protein Binding, Protein Conformation,
	Proteins, Quality Control, Quantum Theory, RNA, RNA Splicing, Receptors,
	Reference Values, Regression Analysis, Reproducibility of Results,
	Research Support, Saccharomyces cerevisiae Proteins, Sensitivity
	and Specificity, Sequence Analysis, Signal Processing, Software,
	Statistical, Stomach Neoplasms, Structural, Structure-Activity Relationship,
	Thermodynamics, Transcription, Tumor Markers, U.S. Gov't},
  pmid = {12421757},
  url = {http://dx.doi.org/10.1101/gr.192502}
}
@article{Mathews1999Expandeda,
  author = {D. H. Mathews and J. Sabina and M. Zuker and D. H. Turner},
  title = {Expanded sequence dependence of thermodynamic parameters improves
	prediction of {RNA} secondary structure},
  journal = {J. Mol. Biol.},
  year = {1999},
  volume = {288},
  pages = {911--940},
  number = {5},
  month = may,
  abstract = {An improved dynamic programming algorithm is reported for RNA secondary
	structure prediction by free energy minimization. Thermodynamic parameters
	for the stabilities of secondary structure motifs are revised to
	include expanded sequence dependence as revealed by recent experiments.
	Additional algorithmic improvements include reduced search time and
	storage for multibranch loop free energies and improved imposition
	of folding constraints. An extended database of 151,503 nt in 955
	structures determined by comparative sequence analysis was assembled
	to allow optimization of parameters not based on experiments and
	to test the accuracy of the algorithm. On average, the predicted
	lowest free energy structure contains 73 \% of known base-pairs when
	domains of fewer than 700 nt are folded; this compares with 64 \%
	accuracy for previous versions of the algorithm and parameters. For
	a given sequence, a set of 750 generated structures contains one
	structure that, on average, has 86 \% of known base-pairs. Experimental
	constraints, derived from enzymatic and flavin mononucleotide cleavage,
	improve the accuracy of structure predictions.},
  doi = {10.1006/jmbi.1999.2700},
  keywords = {16S, 23S, 5S, Affinity, Algorithms, Aluminum Silicates, Amino Acid,
	Amino Acid Sequence, Amyloidosis, Archaeal, Bacillus, Bacterial,
	Bacterial Proteins, Bacteriophage T4, Base Sequence, Chloroplast,
	Chromatography, Circular Dichroism, Comparative Study, Computational
	Biology, Databases, Electrophoresis, Entropy, Enzyme Stability, Escherichia
	coli, Factual, Fibroblast Growth Factor 2, Flavin Mononucleotide,
	Fluorescence, Genetic, Guanidine, Humans, Huntington Disease, Kinetics,
	Light, Models, Molecular Sequence Data, Non-P.H.S., Non-U.S. Gov't,
	Nucleic Acid Conformation, P.H.S., Peptides, Phylogeny, Polyacrylamide
	Gel, Predictive Value of Tests, Protein Binding, Protein Denaturation,
	Protein Folding, Protein Structure, RNA, Radiation, Recombinant Proteins,
	Research Support, Ribosomal, Scattering, Secondary, Sequence Homology,
	Solutions, Spectrometry, Statistical, Temperature, Thermodynamics,
	Time Factors, Trinucleotide Repeat Expansion, U.S. Gov't, alpha-Amylase},
  owner = {vert},
  pii = {S0022-2836(99)92700-6},
  pmid = {10329189},
  timestamp = {2006.04.27},
  url = {http://dx.doi.org/10.1006/jmbi.1999.2700}
}
@article{Matsuda2005novel,
  author = {Matsuda, A. and Vert, J.-P. and Saigo, H. and Ueda, N. and Toh, H.
	and Akutsu, T.},
  title = {A novel representation of protein sequences for prediction of subcellular
	location using support vector machines},
  journal = {Protein Sci.},
  year = {2005},
  volume = {14},
  pages = {2804--2813},
  number = {11},
  abstract = {As the number of complete genomes rapidly increases, accurate methods
	to automatically predict the subcellular location of proteins are
	increasingly useful to help their functional annotation. {I}n order
	to improve the predictive accuracy of the many prediction methods
	developed to date, a novel representation of protein sequences is
	proposed. {T}his representation involves local compositions of amino
	acids and twin amino acids, and local frequencies of distance between
	successive (basic, hydrophobic, and other) amino acids. {F}or calculating
	the local features, each sequence is split into three parts: {N}-terminal,
	middle, and {C}-terminal. {T}he {N}-terminal part is further divided
	into four regions to consider ambiguity in the length and position
	of signal sequences. {W}e tested this representation with support
	vector machines on two data sets extracted from the {SWISS}-{PROT}
	database. {T}hrough fivefold cross-validation tests, overall accuracies
	of more than 87\% and 91\% were obtained for eukaryotic and prokaryotic
	proteins, respectively. {I}t is concluded that considering the respective
	features in the {N}-terminal, middle, and {C}-terminal parts is helpful
	to predict the subcellular location.},
  doi = {10.1110/ps.051597405},
  keywords = {biosvm},
  url = {http://dx.doi.org/10.1110/ps.051597405}
}
@article{Mattfeldt2003Classification,
  author = {Mattfeldt, T. and Gottfried, H. W. and Wolter, H. and Schmidt, V.
	and Kestler, H. A. and Mayer, J.},
  title = {Classification of prostatic carcinoma with artificial neural networks
	using comparative genomic hybridization and quantitative stereological
	data},
  journal = {Pathol. Res. Pract.},
  year = {2003},
  volume = {199},
  pages = {773--784},
  number = {12},
  abstract = {Staging of prostate cancer is a mainstay of treatment decisions and
	prognostication. {I}n the present study, 50 p{T}2{N}0 and 28 p{T}3{N}0
	prostatic adenocarcinomas were characterized by {G}leason grading,
	comparative genomic hybridization ({CGH}), and histological texture
	analysis based on principles of stereology and stochastic geometry.
	{T}he cases were classified by learning vector quantization and support
	vector machines. {T}he quality of classification was tested by cross-validation.
	{C}orrect prediction of stage from primary tumor data was possible
	with an accuracy of 74-80\% from different data sets. {T}he accuracy
	of prediction was similar when the {G}leason score was used as input
	variable, when stereological data were used, or when a combination
	of {CGH} data and stereological data was used. {T}he results of classification
	by learning vector quantization were slightly better than those by
	support vector machines. {A} method is briefly sketched by which
	training of neural networks can be adapted to unequal sample sizes
	per class. {P}rogression from p{T}2 to p{T}3 prostate cancer is correlated
	with complex changes of the epithelial cells in terms of volume fraction,
	of surface area, and of second-order stereological properties. {G}enetically,
	this progression is accompanied by a significant global increase
	in losses and gains of {DNA}, and specifically by increased numerical
	aberrations on chromosome arms 1q, 7p, and 8p.},
  doi = {10.1078/0344-0338-00496},
  keywords = {biosvm, cgh},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1078/0344-0338-00496}
}
@article{Mavroforakis2005Significance,
  author = {Michael Mavroforakis and Harris Georgiou and Nikos Dimitropoulos
	and Dionisis Cavouras and Sergios Theodoridis},
  title = {Significance analysis of qualitative mammographic features, using
	linear classifiers, neural networks and support vector machines},
  journal = {Eur J Radiol},
  year = {2005},
  volume = {54},
  pages = {80--89},
  number = {1},
  month = apr,
  abstract = {Advances in modern technologies and computers have enabled digital
	image processing to become a vital tool in conventional clinical
	practice, including mammography. {H}owever, the core problem of the
	clinical evaluation of mammographic tumors remains a highly demanding
	cognitive task. {I}n order for these automated diagnostic systems
	to perform in levels of sensitivity and specificity similar to that
	of human experts, it is essential that a robust framework on problem-specific
	design parameters is formulated. {T}his study is focused on identifying
	a robust set of clinical features that can be used as the base for
	designing the input of any computer-aided diagnosis system for automatic
	mammographic tumor evaluation. {A} thorough list of clinical features
	was constructed and the diagnostic value of each feature was verified
	against current clinical practices by an expert physician. {T}hese
	features were directly or indirectly related to the overall morphological
	properties of the mammographic tumor or the texture of the fine-scale
	tissue structures as they appear in the digitized image, while others
	contained external clinical data of outmost importance, like the
	patient's age. {T}he entire feature set was used as an annotation
	list for describing the clinical properties of mammographic tumor
	cases in a quantitative way, such that subsequent objective analyses
	were possible. {F}or the purposes of this study, a mammographic image
	database was created, with complete clinical evaluation descriptions
	and positive histological verification for each case. {A}ll tumors
	contained in the database were characterized according to the identified
	clinical features' set and the resulting dataset was used as input
	for discrimination and diagnostic value analysis for each one of
	these features. {S}pecifically, several standard methodologies of
	statistical significance analysis were employed to create feature
	rankings according to their discriminating power. {M}oreover, three
	different classification models, namely linear classifiers, neural
	networks and support vector machines, were employed to investigate
	the true efficiency of each one of them, as well as the overall complexity
	of the diagnostic task of mammographic tumor characterization. {B}oth
	the statistical and the classification results have proven the explicit
	correlation of all the selected features with the final diagnosis,
	qualifying them as an adequate input base for any type of similar
	automated diagnosis system. {T}he underlying complexity of the diagnostic
	task has justified the high value of sophisticated pattern recognition
	architectures.},
  doi = {10.1016/j.ejrad.2004.12.015},
  pdf = {../local/Mavroforakis2005Significance.pdf},
  file = {Mavroforakis2005Significance.pdf:local/Mavroforakis2005Significance.pdf:PDF},
  keywords = {Algorithms, Animals, Antibiotics, Antineoplastic, Artificial Intelligence,
	Butadienes, Chloroplasts, Comparative Study, Computer Simulation,
	Computer-Assisted, Diagnosis, Disinfectants, Dose-Response Relationship,
	Drug, Drug Toxicity, Electrodes, Electroencephalography, Ethylamines,
	Expert Systems, Feedback, Fungicides, Gene Expression Profiling,
	Genes, Genetic Markers, Humans, Implanted, Industrial, Information
	Storage and Retrieval, Kidney, Kidney Tubules, MEDLINE, Male, Mercuric
	Chloride, Microarray Analysis, Molecular Biology, Motor Cortex, Movement,
	Natural Language Processing, Neural Networks (Computer), Non-P.H.S.,
	Non-U.S. Gov't, Plant Proteins, Predictive Value of Tests, Proteins,
	Proteome, Proximal, Puromycin Aminonucleoside, Rats, Reproducibility
	of Results, Research Support, Sprague-Dawley, Subcellular Fractions,
	Terminology, Therapy, Time Factors, Toxicogenetics, U.S. Gov't, User-Computer
	Interface},
  pii = {S0720-048X(05)00023-9},
  pmid = {15797296},
  url = {http://dx.doi.org/10.1016/j.ejrad.2004.12.015}
}
@article{Mayr2003Cross-reactive,
  author = {Torsten Mayr and Christian Igel and Gregor Liebsch and Ingo Klimant
	and Otto S Wolfbeis},
  title = {Cross-reactive metal ion sensor array in a micro titer plate format},
  journal = {Anal Chem},
  year = {2003},
  volume = {75},
  pages = {4389--4396},
  number = {17},
  month = sep,
  abstract = {A cross-reactive array in a micro titer plate ({MTP}) format is described
	that is based on a versatile and highly flexible scheme. {I}t makes
	use of rather unspecific metal ions probes having almost identical
	fluorescence spectra, thus enabling (a) interrogation at identical
	analytical wavelengths, and (b) imaging of the probes contained in
	the wells of the {MTP} using a {CCD} camera and an array of blue-light-emitting
	diodes as a light source. {T}he unselective response of the indicators
	in the presence of mixtures of five divalent cations generates a
	characteristic pattern that was analyzed by chemometric tools. {T}he
	fluorescence intensity of the indicators was transferred into a time-dependent
	parameter applying a scheme called dual lifetime referencing. {I}n
	this method, the fluorescence decay profile of the indicator is referenced
	against the phosphorescence of an inert reference dye added to the
	system. {T}he intrinsically referenced measurements also were performed
	using blue {LED}s as light sources and a {CCD} camera without intensifiers
	as the detector. {T}he best performance was observed if each well
	was excited by a single {LED}. {T}he assembly allows the detection
	of dye concentrations in the nanomoles-per-liter range without amplification
	and the acquisition of 96 wells simultaneously. {T}he pictures obtained
	form the basis for evaluation by pattern recognition algorithms.
	{S}upport vector machines are capable of predicting the presence
	of significant concentrations of metal ions with high accuracy.},
  keywords = {Agrochemicals, Air Pollutants, Aircraft, Algorithms, Artificial Intelligence,
	Automated, Base Composition, Base Sequence, Bayes Theorem, Carbonic
	Anhydrase Inhibitors, Cluster Analysis, Colonic Neoplasms, Comparative
	Study, Computational Biology, Computer Simulation, Computer Systems,
	Computer-Assisted, Computing Methodologies, Confidence Intervals,
	Cytosine, DNA, Data Interpretation, Databases, Diagnosis, Drug Design,
	Enhancer Elements (Genetics), Environmental Monitoring, Enzyme Inhibitors,
	Ethanol, Exons, Forecasting, Fourier Transform Infrared, Gene Expression
	Profiling, Gene Expression Regulation, Genetic, Genetic Screening,
	Glucuronosyltransferase, Guanine, Humans, Image Interpretation, Isoenzymes,
	Least-Squares Analysis, Leukemia, Linear Models, Lymphoma, Models,
	Molecular, Molecular Conformation, Molecular Sequence Data, Natural
	Disasters, Neoplasms, Neoplastic, Neural Networks (Computer), Non-P.H.S.,
	Non-U.S. Gov't, Nonlinear Dynamics, Oligonucleotide Array Sequence
	Analysis, Online Systems, P.H.S., Pattern Recognition, Pharmaceutical
	Preparations, Phenotype, Photography, Probability, Pyrimidines, Quantitative
	Structure-Activity Relationship, RNA Precursors, RNA Splice Sites,
	RNA Splicing, Radiation, Reproducibility of Results, Research Support,
	Sensitivity and Specificity, Sequence Alignment, Sequence Analysis,
	Signal Processing, Software, Spectroscopy, Statistical, Subtraction
	Technique, Terminology, Thermodynamics, Time Factors, U.S. Gov't,
	Untranslated Regions, Video Recording, Walking},
  pmid = {14632041}
}
@article{McAuliffe2004Multiple-sequence,
  author = {McAuliffe, J. D. and Pachter, L. and Jordan, M. I.},
  title = {Multiple-sequence functional annotation and the generalized hidden
	{Markov} phylogeny},
  journal = {Bioinformatics},
  year = {2004},
  volume = {20},
  pages = {1850--1860},
  number = {12},
  month = aug,
  abstract = {M{OTIVATION}: {P}hylogenetic shadowing is a comparative genomics principle
	that allows for the discovery of conserved regions in sequences from
	multiple closely related organisms. {W}e develop a formal probabilistic
	framework for combining phylogenetic shadowing with feature-based
	functional annotation methods. {T}he resulting model, a generalized
	hidden {M}arkov phylogeny ({GHMP}), applies to a variety of situations
	where functional regions are to be inferred from evolutionary constraints.
	{RESULTS}: {W}e show how {GHMP}s can be used to predict complete
	shared gene structures in multiple primate sequences. {W}e also describe
	shadower, our implementation of such a prediction system. {W}e find
	that shadower outperforms previously reported ab initio gene finders,
	including comparative human-mouse approaches, on a small sample of
	diverse exonic regions. {F}inally, we report on an empirical analysis
	of shadower's performance which reveals that as few as five well-chosen
	species may suffice to attain maximal sensitivity and specificity
	in exon demarcation. {AVAILABILITY}: {A} {W}eb server is available
	at http://bonaire.lbl.gov/shadower},
  doi = {10.1093/bioinformatics/bth153},
  pdf = {../local/McAuliffe2004Multiple-sequence.pdf},
  file = {McAuliffe2004Multiple-sequence.pdf:local/McAuliffe2004Multiple-sequence.pdf:PDF},
  keywords = {biogm},
  owner = {vert},
  pii = {bth153},
  pmid = {14988105},
  timestamp = {2006.01.18},
  url = {http://dx.doi.org/10.1093/bioinformatics/bth153}
}
@article{McKnight2003Categorization,
  author = {Larry McKnight and Padmini Srinivasan},
  title = {Categorization of sentence types in medical abstracts},
  journal = {AMIA Annu Symp Proc},
  year = {2003},
  pages = {440--444},
  abstract = {This study evaluated the use of machine learning techniques in the
	classification of sentence type. 7253 structured abstracts and 204
	unstructured abstracts of {R}andomized {C}ontrolled {T}rials from
	{M}ed{LINE} were parsed into sentences and each sentence was labeled
	as one of four types ({I}ntroduction, {M}ethod, {R}esult, or {C}onclusion).
	{S}upport {V}ector {M}achine ({SVM}) and {L}inear {C}lassifier models
	were generated and evaluated on cross-validated data. {T}reating
	sentences as a simple "bag of words", the {SVM} model had an average
	{ROC} area of 0.92. {A}dding a feature of relative sentence location
	improved performance markedly for some models and overall increasing
	the average {ROC} to 0.95. {L}inear classifier performance was significantly
	worse than the {SVM} in all datasets. {U}sing the {SVM} model trained
	on structured abstracts to predict unstructured abstracts yielded
	performance similar to that of models trained with unstructured abstracts
	in 3 of the 4 types. {W}e conclude that classification of sentence
	type seems feasible within the domain of {RCT}'s. {I}dentification
	of sentence types may be helpful for providing context to end users
	or other text summarization techniques.},
  keywords = {biosvm},
  pii = {D030003164}
}
@article{Meinicke2004Oligo,
  author = {Meinicke, P. and Tech, M. and Morgenstern, B. and Merkl, R.},
  title = {Oligo kernels for datamining on biological sequences: a case study
	on prokaryotic translation initiation sites},
  journal = {BMC Bioinformatics},
  year = {2004},
  volume = {5},
  pages = {169},
  abstract = {Background {K}ernel-based learning algorithms are among the most advanced
	machine learning methods and have been successfully applied to a
	variety of sequence classification tasks within the field of bioinformatics.
	{C}onventional kernels utilized so far do not provide an easy interpretation
	of the learnt representations in terms of positional and compositional
	variability of the underlying biological signals. {R}esults {W}e
	propose a kernel-based approach to datamining on biological sequences.
	{W}ith our method it is possible to model and analyze positional
	variability of oligomers of any length in a natural way. {O}n one
	hand this is achieved by mapping the sequences to an intuitive but
	high-dimensional feature space, well-suited for interpretation of
	the learnt models. {O}n the other hand, by means of the kernel trick
	we can provide a general learning algorithm for that high-dimensional
	representation because all required statistics can be computed without
	performing an explicit feature space mapping of the sequences. {B}y
	introducing a kernel parameter that controls the degree of position-dependency,
	our feature space representation can be tailored to the characteristics
	of the biological problem at hand. {A} regularized learning scheme
	enables application even to biological problems for which only small
	sets of example sequences are available. {O}ur approach includes
	a visualization method for transparent representation of characteristic
	sequence features. {T}hereby importance of features can be measured
	in terms of discriminative strength with respect to classification
	of the underlying sequences. {T}o demonstrate and validate our concept
	on a biochemically well-defined case, we analyze {E}. coli translation
	initiation sites in order to show that we can find biologically relevant
	signals. {F}or that case, our results clearly show that the {S}hine-{D}algarno
	sequence is the most important signal upstream a start codon. {T}he
	variability in position and composition we found for that signal
	is in accordance with previous biological knowledge. {W}e also find
	evidence for signals downstream of the start codon, previously introduced
	as transcriptional enhancers. {T}hese signals are mainly characterized
	by occurrences of adenine in a region of about 4 nucleotides next
	to the start codon. {C}onclusions {W}e showed that the oligo kernel
	can provide a valuable tool for the analysis of relevant signals
	in biological sequences. {I}n the case of translation initiation
	sites we could clearly deduce the most discriminative motifs and
	their positional variation from example sequences. {A}ttractive features
	of our approach are its flexibility with respect to oligomer length
	and position conservation. {B}y means of these two parameters oligo
	kernels can easily be adapted to different biological problems.},
  doi = {10.1186/1471-2105-5-169},
  pdf = {../local/Meinicke2004Oligo.pdf},
  file = {Meinicke2004Oligo.pdf:local/Meinicke2004Oligo.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://www.biomedcentral.com/1471-2105/5/169}
}
@article{Meireles2003Differentially,
  author = {Meireles, S. I. and Carvalho, A. F. and Hirata, R. and Montagnini,
	A. L. and Martins, W. K. and Runza, F. B. and Stolf, B. S. and Termini,
	L. and Neto, C. E. and Silva, R. L. and Soares, F. A. and Neves, E. J.
	and Reis, L. F.},
  title = {Differentially expressed genes in gastric tumors identified by {cDNA}
	array},
  journal = {Cancer Lett.},
  year = {2003},
  volume = {190},
  pages = {199--211},
  number = {2},
  month = feb,
  abstract = {Using c{DNA} fragments from the {FAPESP}/l{ICR} {C}ancer {G}enome
	{P}roject, we constructed a c{DNA} array having 4512 elements and
	determined gene expression in six normal and six tumor gastric tissues.
	{U}sing t-statistics, we identified 80 c{DNA}s whose expression in
	normal and tumor samples differed more than 3.5 sample standard deviations.
	{U}sing {S}elf-{O}rganizing {M}ap, the expression profile of these
	c{DNA}s allowed perfect separation of malignant and non-malignant
	samples. {U}sing the supervised learning procedure {S}upport {V}ector
	{M}achine, we identified trios of c{DNA}s that could be used to classify
	samples as normal or tumor, based on single-array analysis. {F}inally,
	we identified genes with altered linear correlation when their expression
	in normal and tumor samples were compared. {F}urther investigation
	concerning the function of these genes could contribute to the understanding
	of gastric carcinogenesis and may prove useful in molecular diagnostics.},
  doi = {10.1016/S0304-3835(02)00587-6},
  pdf = {../local/Meireles2003Differentially.pdf},
  file = {Meireles2003Differentially.pdf:local/Meireles2003Differentially.pdf:PDF},
  keywords = {biosvm, microarray},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1016/S0304-3835(02)00587-6}
}
@article{Mestres2004Computational,
  author = {Jordi Mestres},
  title = {Computational chemogenomics approaches to systematic knowledge-based
	drug discovery.},
  journal = {Curr Opin Drug Discov Devel},
  year = {2004},
  volume = {7},
  pages = {304--313},
  number = {3},
  month = may,
  abstract = {Chemogenomics, the identification of all possible drugs for all possible
	targets, has recently emerged as a new paradigm in drug discovery
	in which efficiency in the compound design and optimization process
	is achieved through the gain and reuse of targeted knowledge. As
	targeted knowledge resides at the interface between chemistry and
	biology, computational tools aimed at integrating the chemical and
	biological spaces play a central role in chemogenomics. This review
	covers the recent progress made in integrative computational approaches
	to data annotation and knowledge generation for the systematic knowledge-based
	design and screening of chemical libraries.},
  keywords = {Chemistry, Pharmaceutical; Combinatorial Chemistry Techniques; Computational
	Biology; Drug Design; Genomics; Ligands; Proteins; Receptors, G-Protein-Coupled},
  owner = {vert},
  pmid = {15216933},
  timestamp = {2007.08.02}
}
@article{Middendorf2004Discriminative,
  author = {Middendorf, M. and Ziv, E. and Adams, C. and Hom, J. and Koytcheff,
	R. and Levovitz, C. and Woods, G. and Chen, L. and Wiggins, C.},
  title = {Discriminative topological features reveal biological network mechanisms.},
  journal = {B{MC} {B}ioinformatics},
  year = {2004},
  volume = {5},
  pages = {181},
  abstract = {B{ACKGROUND}: {R}ecent genomic and bioinformatic advances have motivated
	the development of numerous network models intending to describe
	graphs of biological, technological, and sociological origin. {I}n
	most cases the success of a model has been evaluated by how well
	it reproduces a few key features of the real-world data, such as
	degree distributions, mean geodesic lengths, and clustering coefficients.
	{O}ften pairs of models can reproduce these features with indistinguishable
	fidelity despite being generated by vastly different mechanisms.
	{I}n such cases, these few target features are insufficient to distinguish
	which of the different models best describes real world networks
	of interest; moreover, it is not clear a priori that any of the presently-existing
	algorithms for network generation offers a predictive description
	of the networks inspiring them. {RESULTS}: {W}e present a method
	to assess systematically which of a set of proposed network generation
	algorithms gives the most accurate description of a given biological
	network. {T}o derive discriminative classifiers, we construct a mapping
	from the set of all graphs to a high-dimensional (in principle infinite-dimensional)
	"word space". {T}his map defines an input space for classification
	schemes which allow us to state unambiguously which models are most
	descriptive of a given network of interest. {O}ur training sets include
	networks generated from 17 models either drawn from the literature
	or introduced in this work. {W}e show that different duplication-mutation
	schemes best describe the {E}. coli genetic network, the {S}. cerevisiae
	protein interaction network, and the {C}. elegans neuronal network,
	out of a set of network models including a linear preferential attachment
	model and a small-world model. {CONCLUSIONS}: {O}ur method is a first
	step towards systematizing network models and assessing their predictability,
	and we anticipate its usefulness for a number of communities.},
  doi = {10.1186/1471-2105-5-181},
  pdf = {../local/Middendorf2004Discriminative.pdf},
  file = {Middendorf2004Discriminative.pdf:local/Middendorf2004Discriminative.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://www.biomedcentral.com/1471-2105/5/181}
}
@article{Mika2004NLProt,
  author = {Sven Mika and Burkhard Rost},
  title = {N{LP}rot: extracting protein names and sequences from papers.},
  journal = {Nucleic {A}cids {R}es},
  year = {2004},
  volume = {32},
  pages = {W634--W637},
  number = {Web Server issue},
  month = jul,
  abstract = {Automatically extracting protein names from the literature and linking
	these names to the associated entries in sequence databases is becoming
	increasingly important for annotating biological databases. {NLP}rot
	is a novel system that combines dictionary- and rule-based filtering
	with several support vector machines ({SVM}s) to tag protein names
	in {P}ub{M}ed abstracts. {W}hen considering partially tagged names
	as errors, {NLP}rot still reached a precision of 75\% at a recall
	of 76\%. {B}y many criteria our system outperformed other tagging
	methods significantly; in particular, it proved very reliable even
	for novel names. {N}ames encountered particularly frequently in {D}rosophila,
	such as white, wing and bizarre, constitute an obvious limitation
	of {NLP}rot. {O}ur method is available both as an {I}nternet server
	and as a program for download (http://cubic.bioc.columbia.edu/services/{NLP}rot/).
	{I}nput can be {P}ub{M}ed/{MEDLINE} identifiers, authors, titles
	and journals, as well as collections of abstracts, or entire papers.},
  doi = {10.1093/nar/gkh427},
  keywords = {biosvm nlp},
  pii = {32/suppl_2/W634},
  url = {http://dx.doi.org/10.1093/nar/gkh427}
}
@article{Mika2004Protein,
  author = {Mika, Sven and Rost, Burkhard},
  title = {Protein names precisely peeled off free text},
  journal = {Bioinformatics},
  year = {2004},
  volume = {20},
  pages = {i241--i247},
  number = {Suppl. 1},
  abstract = {Motivation: {A}utomatically identifying protein names from the scientific
	literature is a pre-requisite for the increasing demand in data-mining
	this wealth of information. {E}xisting approaches are based on dictionaries,
	rules and machine-learning. {H}ere, we introduced a novel system
	that combines a pre-processing dictionary- and rule-based filtering
	step with several separately trained support vector machines ({SVM}s)
	to identify protein names in the {MEDLINE} abstracts. {R}esults:
	{O}ur new tagging-system {NLP}rot is capable of extracting protein
	names with a precision (accuracy) of 75\% at a recall (coverage) of
	76\% after training on a corpus, which was used before by other groups
	and contains 200 annotated abstracts. {F}or our estimate of sustained
	performance, we considered partially identified names as false positives.
	{O}ne important issue frequently ignored in the literature is the
	redundancy in evaluation sets. {W}e suggested some guidelines for
	removing overly inadequate overlaps between training and testing
	sets. {A}pplying these new guidelines, our program appeared to significantly
	out-perform other methods tagging protein names. {NLP}rot was so
	successful due to the {SVM}-building blocks that succeeded in utilizing
	the local context of protein names in the scientific literature.
	{W}e challenge that our system may constitute the most general and
	precise method for tagging protein names. {A}vailability: http://cubic.bioc.columbia.edu/services/nlprot/},
  pdf = {../local/Mika2004Protein.pdf},
  file = {Mika2004Protein.pdf:local/Mika2004Protein.pdf:PDF},
  keywords = {biosvm nlp},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/20/suppl_1/i241}
}
@article{Mishra2008Review,
  author = {K. P. Mishra and L. Ganju and M. Sairam and P. K. Banerjee and R.
	C. Sawhney},
  title = {A review of high throughput technology for the screening of natural
	products.},
  journal = {Biomed Pharmacother},
  year = {2008},
  volume = {62},
  pages = {94--98},
  number = {2},
  month = feb,
  abstract = {High throughput screening is commonly defined as automatic testing
	of potential drug candidates at a rate in excess of 10,000 compounds
	per week. The aim of high throughput drug discovery is to test large
	compound collections for potentially active compounds ('hits') in
	order to allow further development of compounds for pre-clinical
	testing ('leads'). High throughput technology has emerged over the
	last few years as an important tool for drug discovery and lead optimisation.
	In this approach, the molecular diversity and range of biological
	properties displayed by secondary metabolites constitutes a challenge
	to combinatorial strategies for natural products synthesis and derivatization.
	This article reviews the approach of High throughput technique for
	the screening of natural products for drug discovery.},
  doi = {10.1016/j.biopha.2007.06.012},
  institution = {Defence Institute of Physiology and Allied Sciences, Lucknow Road,
	Timarpur, Delhi 110054, India.},
  keywords = {Automation; Biological Products, pharmacology; Combinatorial Chemistry
	Techniques; Drug Design; Drug Evaluation, Preclinical; Technology,
	Pharmaceutical, methods},
  language = {eng},
  medline-pst = {ppublish},
  owner = {philippe},
  pii = {S0753-3322(07)00127-8},
  pmid = {17692498},
  timestamp = {2010.07.26},
  url = {http://dx.doi.org/10.1016/j.biopha.2007.06.012}
}
@article{Mitsumori2005Gene,
  author = {Tomohiro Mitsumori and Sevrani Fation and Masaki Murata and Kouichi
	Doi and Hirohumi Doi},
  title = {Gene/protein name recognition based on support vector machine using
	dictionary as features.},
  journal = {B{MC} {B}ioinformatics},
  year = {2005},
  volume = {6},
  pages = {S8},
  number = {Suppl 1},
  abstract = {B{ACKGROUND}: {A}utomated information extraction from biomedical literature
	is important because a vast amount of biomedical literature has been
	published. {R}ecognition of the biomedical named entities is the
	first step in information extraction. {W}e developed an automated
	recognition system based on the {SVM} algorithm and evaluated it
	in {T}ask 1.{A} of {B}io{C}re{A}t{I}v{E}, a competition for automated
	gene/protein name recognition. {RESULTS}: {I}n the work presented
	here, our recognition system uses the feature set of the word, the
	part-of-speech ({POS}), the orthography, the prefix, the suffix,
	and the preceding class. {W}e call these features "internal resource
	features", i.e., features that can be found in the training data.
	{A}dditionally, we consider the features of matching against dictionaries
	to be external resource features. {W}e investigated and evaluated
	the effect of these features as well as the effect of tuning the
	parameters of the {SVM} algorithm. {W}e found that the dictionary
	matching features contributed slightly to the improvement in the
	performance of the f-score. {W}e attribute this to the possibility
	that the dictionary matching features might overlap with other features
	in the current multiple feature setting. {CONCLUSION}: {D}uring {SVM}
	learning, each feature alone had a marginally positive effect on
	system performance. {T}his supports the fact that the {SVM} algorithm
	is robust on the high dimensionality of the feature vector space
	and means that feature selection is not required.},
  doi = {10.1186/1471-2105-6-S1-S8},
  pdf = {../local/Mitsumori2005Gene.pdf},
  file = {Mitsumori2005Gene.pdf:local/Mitsumori2005Gene.pdf:PDF},
  keywords = {biosvm nlp},
  pii = {1471-2105-6-S1-S8},
  url = {http://dx.doi.org/10.1186/1471-2105-6-S1-S8}
}
@article{Miwakeichi2001comparison,
  author = {F. Miwakeichi and R. Ramirez-Padron and P. A. Valdes-Sosa and T.
	Ozaki},
  title = {A comparison of non-linear non-parametric models for epilepsy data.},
  journal = {Comput. {B}iol. {M}ed.},
  year = {2001},
  volume = {31},
  pages = {41--57},
  number = {1},
  month = jan,
  abstract = {E{EG} spike and wave ({SW}) activity has been described through a
	non-parametric stochastic model estimated by the {N}adaraya-{W}atson
	({NW}) method. {I}n this paper the performance of the {NW}, the local
	linear polynomial regression and support vector machines ({SVM})
	methods were compared. {T}he noise-free realizations obtained by
	the {NW} and {SVM} methods reproduced {SW} better than as reported
	in previous works. {T}he tuning parameters had to be estimated manually.
	{A}dding dynamical noise, only the {NW} method was capable of generating
	{SW} similar to training data. {T}he standard deviation of the dynamical
	noise was estimated by means of the correlation dimension.},
  keywords = {Acute, Acute Disease, Adenocarcinoma, Algorithms, Amino Acid Sequence,
	Animals, Artificial Intelligence, Automated, B-Lymphocytes, Bacterial
	Proteins, Base Pair Mismatch, Base Sequence, Bayes Theorem, Binding
	Sites, Biological, Bone Marrow Cells, Brachyura, Cell Compartmentation,
	Chemistry, Child, Chromosome Aberrations, Classification, Codon,
	Colonic Neoplasms, Comparative Study, Computational Biology, Computer
	Simulation, Computer-Assisted, DNA, Data Interpretation, Databases,
	Decision Trees, Diabetes Mellitus, Diagnosis, Discriminant Analysis,
	Discrimination Learning, Electric Conductivity, Electroencephalography,
	Electrophysiology, Epilepsy, Escherichia coli Proteins, Factual,
	Feedback, Female, Fungal, Gastric Emptying, Gene Expression Profiling,
	Gene Expression Regulation, Genes, Genetic, Genetic Markers, Genetic
	Predisposition to Disease, Genomics, Hemolysins, Humans, Indians,
	Information Storage and Retrieval, Initiator, Ion Channels, Kinetics,
	Leukemia, Likelihood Functions, Linear Models, Lipid Bilayers, Logistic
	Models, Lymphocytic, MEDLINE, Male, Markov Chains, Melanoma, Models,
	Molecular, Myeloid, Neoplasm, Neoplasms, Neoplastic, Neural Networks
	(Computer), Neurological, Nevus, Non-P.H.S., Non-U.S. Gov't, Nonlinear
	Dynamics, Normal Distribution, North American, Nucleic Acid Conformation,
	Oligonucleotide Array Sequence Analysis, Organ Specificity, Organelles,
	Ovarian Neoplasms, Ovary, P.H.S., Pattern Recognition, Physical,
	Pigmented, Predictive Value of Tests, Promoter Regions (Genetics),
	Protein Biosynthesis, Protein Folding, Protein Structure, Proteins,
	Proteome, RNA, Reproducibility of Results, Research Support, Saccharomyces
	cerevisiae, Secondary, Sensitivity and Specificity, Sequence Alignment,
	Sequence Analysis, Sex Characteristics, Skin Diseases, Skin Neoplasms,
	Skin Pigmentation, Software, Sound Spectrography, Statistical, Stochastic
	Processes, Stomach Diseases, T-Lymphocytes, Thermodynamics, Transcription,
	Transcription Factors, Tumor Markers, Type 2, U.S. Gov't, Vertebrates},
  pii = {S0010482500000214},
  pmid = {11058693}
}
@article{Model2001Feature,
  author = {Model, F. and Adorjan, P. and Olek, A. and Piepenbrock, C.},
  title = {Feature selection for {DNA} methylation based cancer classification},
  journal = {Bioinformatics},
  year = {2001},
  volume = {17},
  pages = {S157--S164},
  number = {Supp. 1},
  abstract = {Molecular portraits, such as m{RNA} expression or {DNA} methylation
	patterns, have been shown to be strongly correlated with phenotypical
	parameters. {T}hese molecular patterns can be revealed routinely
	on a genomic scale. {H}owever, class prediction based on these patterns
	is an under-determined problem, due to the extreme high dimensionality
	of the data compared to the usually small number of available samples.
	{T}his makes a reduction of the data dimensionality necessary. {H}ere
	we demonstrate how phenotypic classes can be predicted by combining
	feature selection and discriminant analysis. {B}y comparing several
	feature selection methods we show that the right dimension reduction
	strategy is of crucial importance for the classification performance.
	{T}he techniques are demonstrated by methylation pattern based discrimination
	between acute lymphoblastic leukemia and acute myeloid leukemia.
	{C}ontact: {F}abian.{M}odel@epigenomics.com},
  pdf = {../local/Model2001Feature.pdf},
  file = {Model2001Feature.pdf:local/Model2001Feature.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/17/suppl_1/S157}
}
@article{Moler2000Analysis,
  author = {Moler, E. J. and Chow, M. L. and Mian, I. S.},
  title = {Analysis of molecular profile data using generative and discriminative
	methods},
  journal = {Physiol. {G}enomics},
  year = {2000},
  volume = {4},
  pages = {109--126},
  number = {2},
  month = dec,
  abstract = {A modular framework is proposed for modeling and understanding the
	relationships between molecular profile data and other domain knowledge
	using a combination of generative (here, graphical models) and discriminative
	[{S}upport {V}ector {M}achines ({SVM}s)] methods. {A}s illustration,
	naive {B}ayes models, simple graphical models, and {SVM}s were applied
	to published transcription profile data for 1,988 genes in 62 colon
	adenocarcinoma tissue specimens labeled as tumor or nontumor. {T}hese
	unsupervised and supervised learning methods identified three classes
	or subtypes of specimens, assigned tumor or nontumor labels to new
	specimens and detected six potentially mislabeled specimens. {T}he
	probability parameters of the three classes were utilized to develop
	a novel gene relevance, ranking, and selection method. {SVM}s trained
	to discriminate nontumor from tumor specimens using only the 50-200
	top-ranked genes had the same or better generalization performance
	than the full repertoire of 1,988 genes. {A}pproximately 90 marker
	genes were pinpointed for use in understanding the basic biology
	of colon adenocarcinoma, defining targets for therapeutic intervention
	and developing diagnostic tools. {T}hese potential markers highlight
	the importance of tissue biology in the etiology of cancer. {C}omparative
	analysis of molecular profile data is proposed as a mechanism for
	predicting the physiological function of genes in instances when
	comparative sequence analysis proves uninformative, such as with
	human and yeast translationally controlled tumour protein. {G}raphical
	models and {SVM}s hold promise as the foundations for developing
	decision support systems for diagnosis, prognosis, and monitoring
	as well as inferring biological networks.},
  pdf = {../local/Moler2000Analysis.pdf},
  file = {Moler2000Analysis.pdf:local/Moler2000Analysis.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://physiolgenomics.physiology.org/cgi/content/abstract/4/2/109}
}
@techreport{Mukherjee1998Support,
  author = {S. Mukherjee and P. Tamayo and J. P. Mesirov and D. Slonim and A.
	Verri and T. Poggio},
  title = {Support vector machine classification of microarray data},
  institution = {C.B.C.L.},
  year = {1998},
  number = {182},
  note = {A.I. Memo 1677},
  pdf = {../local/Mukherjee1998Support.pdf},
  file = {Mukherjee1998Support.pdf:local/Mukherjee1998Support.pdf:PDF},
  keywords = {biosvm microarray},
  subject = {biokernel},
  url = {http://citeseer.nj.nec.com/437379.html}
}
@techreport{Murphy1999Modelling,
  author = {Murphy, K. and Mian, S.},
  title = {Modelling gene expression data using dynamic {B}ayesian networks},
  institution = {Computer Science Division, University of California, Berkeley, CA},
  year = {1999},
  pdf = {../local/Murphy1999Modelling.pdf},
  file = {Murphy1999Modelling.pdf:local/Murphy1999Modelling.pdf:PDF},
  keywords = {biogm},
  owner = {vert},
  timestamp = {2006.01.18}
}
@article{Myasnikova2002Support,
  author = {Myasnikova, E. and Samsonova, A. and Samsonova, M. and Reinitz, J.},
  title = {Support vector regression applied to the determination of the developmental
	age of a {D}rosophila embryo from its segmentation gene expression
	patterns},
  journal = {Bioinformatics},
  year = {2002},
  volume = {18},
  pages = {S87--S95},
  number = {Suppl. 1},
  abstract = {Motivation: {I}n this paper we address the problem of the determination
	of developmental age of an embryo from its segmentation gene expression
	patterns in {D}rosophila. {R}esults: {B}y applying support vector
	regression we have developed a fast method for automated staging
	of an embryo on the basis of its gene expression pattern. {S}upport
	vector regression is a statistical method for creating regression
	functions of arbitrary type from a set of training data. {T}he training
	set is composed of embryos for which the precise developmental age
	was determined by measuring the degree of membrane invagination.
	{T}esting the quality of regression on the training set showed good
	prediction accuracy. {T}he optimal regression function was then used
	for the prediction of the gene expression based age of embryos in
	which the precise age has not been measured by membrane morphology.
	{M}oreover, we show that the same accuracy of prediction can be achieved
	when the dimensionality of the feature vector was reduced by applying
	factor analysis. {T}he data reduction allowed us to avoid over-fitting
	and to increase the efficiency of the algorithm. {A}vailability:
	{T}his software may be obtained from the authors. {C}ontact: samson@fn.csa.ru
	{K}eywords: gene expression patterns; development; embryo staging;
	support vector regression; segmentation genes; {D}rosophila.},
  pdf = {../local/Myasnikova2002Support.pdf},
  file = {Myasnikova2002Support.pdf:local/Myasnikova2002Support.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/18/suppl_1/S87}
}
@article{Mueller2005Classifying,
  author = {K.-R. M{\"u}ller and G. R{\"a}tsch and S. Sonnenburg and S. Mika
	and M. Grimm and N. Heinrich},
  title = {Classifying 'drug-likeness' with {K}ernel-based learning methods.},
  journal = {J {C}hem {I}nf {M}odel},
  year = {2005},
  volume = {45},
  pages = {249--253},
  number = {2},
  abstract = {In this article we report about a successful application of modern
	machine learning technology, namely {S}upport {V}ector {M}achines,
	to the problem of assessing the 'drug-likeness' of a chemical from
	a given set of descriptors of the substance. {W}e were able to drastically
	improve the recent result by {B}yvatov et al. (2003) on this task
	and achieved an error rate of about 7\% on unseen compounds using
	{S}upport {V}ector {M}achines. {W}e see a very high potential of
	such machine learning techniques for a variety of computational chemistry
	problems that occur in the drug discovery and drug design process.},
  doi = {10.1021/ci049737o},
  pdf = {../local/Mueller2005Classifying.pdf},
  file = {Mueller2005Classifying.pdf:local/Mueller2005Classifying.pdf:PDF},
  keywords = {biosvm chemoinformatics},
  url = {http://dx.doi.org/10.1021/ci049737o}
}
@article{Nabieva2005Whole-proteome,
  author = {Elena Nabieva and Kam Jim and Amit Agarwal and Bernard Chazelle and
	Mona Singh},
  title = {Whole-proteome prediction of protein function via graph-theoretic
	analysis of interaction maps.},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21},
  pages = {i302--i310},
  number = {Suppl 1},
  month = jun,
  abstract = {MOTIVATION: Determining protein function is one of the most important
	problems in the post-genomic era. For the typical proteome, there
	are no functional annotations for one-third or more of its proteins.
	Recent high-throughput experiments have determined proteome-scale
	protein physical interaction maps for several organisms. These physical
	interactions are complemented by an abundance of data about other
	types of functional relationships between proteins, including genetic
	interactions, knowledge about co-expression and shared evolutionary
	history. Taken together, these pairwise linkages can be used to build
	whole-proteome protein interaction maps. RESULTS: We develop a network-flow
	based algorithm, FunctionalFlow, that exploits the underlying structure
	of protein interaction maps in order to predict protein function.
	In cross-validation testing on the yeast proteome, we show that FunctionalFlow
	has improved performance over previous methods in predicting the
	function of proteins with few (or no) annotated protein neighbors.
	By comparing several methods that use protein interaction maps to
	predict protein function, we demonstrate that FunctionalFlow performs
	well because it takes advantage of both network topology and some
	measure of locality. Finally, we show that performance can be improved
	substantially as we consider multiple data sources and use them to
	create weighted interaction networks. AVAILABILITY: http://compbio.cs.princeton.edu/function},
  doi = {10.1093/bioinformatics/bti1054},
  institution = {Computer Science Department, Princeton University Princeton, NJ 08544,
	USA.},
  keywords = {Algorithms; Computational Biology, methods; Evolution, Molecular;
	Fungal Proteins, chemistry; Genomics; Models, Statistical; Models,
	Theoretical; Protein Interaction Mapping, methods; Proteins, chemistry;
	Proteomics, methods},
  language = {eng},
  medline-pst = {ppublish},
  owner = {jp},
  pii = {21/suppl_1/i302},
  pmid = {15961472},
  timestamp = {2010.04.03},
  url = {http://dx.doi.org/10.1093/bioinformatics/bti1054}
}
@article{Nair2005Mimicking,
  author = {Rajesh Nair and Burkhard Rost},
  title = {Mimicking cellular sorting improves prediction of subcellular localization.},
  journal = {J {M}ol {B}iol},
  year = {2005},
  volume = {348},
  pages = {85--100},
  number = {1},
  month = apr,
  abstract = {Predicting the native subcellular compartment of a protein is an important
	step toward elucidating its function. {H}ere we introduce {LOC}tree,
	a hierarchical system combining support vector machines ({SVM}s)
	and other prediction methods. {LOC}tree predicts the subcellular
	compartment of a protein by mimicking the mechanism of cellular sorting
	and exploiting a variety of sequence and predicted structural features
	in its input. {C}urrently {LOC}tree does not predict localization
	for membrane proteins, since the compositional properties of membrane
	proteins significantly differ from those of non-membrane proteins.
	{W}hile any information about function can be used by the system,
	we present estimates of performance that are valid when only the
	amino acid sequence of a protein is known. {W}hen evaluated on a
	non-redundant test set, {LOC}tree achieved sustained levels of 74\%
	accuracy for non-plant eukaryotes, 70\% for plants, and 84\% for
	prokaryotes. {W}e rigorously benchmarked {LOC}tree in comparison
	to the best alternative methods for localization prediction. {LOC}tree
	outperformed all other methods in nearly all benchmarks. {L}ocalization
	assignments using {LOC}tree agreed quite well with data from recent
	large-scale experiments. {O}ur preliminary analysis of a few entirely
	sequenced organisms, namely human ({H}omo sapiens), yeast ({S}accharomyces
	cerevisiae), and weed ({A}rabidopsis thaliana) suggested that over
	35\% of all non-membrane proteins are nuclear, about 20\% are retained
	in the cytosol, and that every fifth protein in the weed resides
	in the chloroplast.},
  doi = {10.1016/j.jmb.2005.02.025},
  pdf = {../local/Nair2005Mimicking.pdf},
  file = {Nair2005Mimicking.pdf:local/Nair2005Mimicking.pdf:PDF},
  keywords = {biosvm},
  pii = {S0022-2836(05)00177-4},
  url = {http://dx.doi.org/10.1016/j.jmb.2005.02.025}
}
@article{Natt2004Prediction,
  author = {Natt, N. K. and Kaur, H. and Raghava, G. P.},
  title = {Prediction of transmembrane regions of beta-barrel proteins using
	{ANN}- and {SVM}-based methods.},
  journal = {Proteins},
  year = {2004},
  volume = {56},
  pages = {11--18},
  number = {1},
  abstract = {This article describes a method developed for predicting transmembrane
	beta-barrel regions in membrane proteins using machine learning techniques:
	artificial neural network ({ANN}) and support vector machine ({SVM}).
	{T}he {ANN} used in this study is a feed-forward neural network with
	a standard back-propagation training algorithm. {T}he accuracy of
	the {ANN}-based method improved significantly, from 70.4\% to 80.5\%,
	when evolutionary information was added to a single sequence as a
	multiple sequence alignment obtained from {PSI}-{BLAST}. {W}e have
	also developed an {SVM}-based method using a primary sequence as
	input and achieved an accuracy of 77.4\%. {T}he {SVM} model was modified
	by adding 36 physicochemical parameters to the amino acid sequence
	information. {F}inally, {ANN}- and {SVM}-based methods were combined
	to utilize the full potential of both techniques. {T}he accuracy
	and {M}atthews correlation coefficient ({MCC}) value of {SVM}, {ANN},
	and combined method are 78.5\%, 80.5\%, and 81.8\%, and 0.55, 0.63,
	and 0.64, respectively. {T}hese methods were trained and tested on
	a nonredundant data set of 16 proteins, and performance was evaluated
	using "leave one out cross-validation" ({LOOCV}). {B}ased on this
	study, we have developed a {W}eb server, {TBBP}red, for predicting
	transmembrane beta-barrel regions in proteins (available at http://www.imtech.res.in/raghava/tbbpred).},
  doi = {10.1002/prot.20092},
  pdf = {../local/Natt2004Prediction.pdf},
  file = {Natt2004Prediction.pdf:local/Natt2004Prediction.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1002/prot.20092}
}
@article{Nguyen2005Prediction,
  author = {Minh N Nguyen and Jagath C Rajapakse},
  title = {Prediction of protein relative solvent accessibility with a two-stage
	{SVM} approach.},
  journal = {Proteins},
  year = {2005},
  volume = {59},
  pages = {30--37},
  number = {1},
  month = apr,
  abstract = {Information on relative solvent accessibility ({RSA}) of amino acid
	residues in proteins provides valuable clues to the prediction of
	protein structure and function. {A} two-stage approach with support
	vector machines ({SVM}s) is proposed, where an {SVM} predictor is
	introduced to the output of the single-stage {SVM} approach to take
	into account the contextual relationships among solvent accessibilities
	for the prediction. {B}y using the position-specific scoring matrices
	({PSSM}s) generated by {PSI}-{BLAST}, the two-stage {SVM} approach
	achieves accuracies up to 90.4\% and 90.2\% on the {M}anesh data
	set of 215 protein structures and the {RS}126 data set of 126 nonhomologous
	globular proteins, respectively, which are better than the highest
	published scores on both data sets to date. {A} {W}eb server for
	protein {RSA} prediction using a two-stage {SVM} method has been
	developed and is available (http://birc.ntu.edu.sg/~pas0186457/rsa.html).},
  doi = {10.1002/prot.20404},
  pdf = {../local/Nguyen2005Prediction.pdf},
  file = {Nguyen2005Prediction.pdf:local/Nguyen2005Prediction.pdf:PDF},
  keywords = {biosvm},
  url = {http://dx.doi.org/10.1002/prot.20404}
}
@article{Nguyen2005Two-stage,
  author = {M. N. Nguyen and J. C. Rajapakse},
  title = {Two-stage multi-class support vector machines to protein secondary
	structure prediction.},
  journal = {Pac {S}ymp {B}iocomput},
  year = {2005},
  pages = {346--357},
  abstract = {Bioinformatics techniques to protein secondary structure ({PSS}) prediction
	are mostly single-stage approaches in the sense that they predict
	secondary structures of proteins by taking into account only the
	contextual information in amino acid sequences. {I}n this paper,
	we propose two-stage {M}ulti-class {S}upport {V}ector {M}achine ({MSVM})
	approach where a {MSVM} predictor is introduced to the output of
	the first stage {MSVM} to capture the sequential relationship among
	secondary structure elements for the prediction. {B}y using position
	specific scoring matrices, generated by {PSI}-{BLAST}, the two-stage
	{MSVM} approach achieves {Q}3 accuracies of 78.0\% and 76.3\% on
	the {RS}126 dataset of 126 nonhomologous globular proteins and the
	{CB}396 dataset of 396 nonhomologous proteins, respectively, which
	are better than the highest scores published on both datasets to
	date.},
  keywords = {biosvm}
}
@article{Nguyen2003Multi-class,
  author = {Minh N Nguyen and Jagath C Rajapakse},
  title = {Multi-class support vector machines for protein secondary structure
	prediction.},
  journal = {Genome {I}nform {S}er {W}orkshop {G}enome {I}nform},
  year = {2003},
  volume = {14},
  pages = {218--227},
  abstract = {The solution of binary classification problems using the {S}upport
	{V}ector {M}achine ({SVM}) method has been well developed. {T}hough
	multi-class classification is typically solved by combining several
	binary classifiers, recently, several multi-class methods that consider
	all classes at once have been proposed. {H}owever, these methods
	require resolving a much larger optimization problem and are applicable
	to small datasets. {T}hree methods based on binary classifications:
	one-against-all ({OAA}), one-against-one ({OAO}), and directed acyclic
	graph ({DAG}), and two approaches for multi-class problem by solving
	one single optimization problem, are implemented to predict protein
	secondary structure. {O}ur experiments indicate that multi-class
	{SVM} methods are more suitable for protein secondary structure ({PSS})
	prediction than the other methods, including binary {SVM}s, because
	their capacity to solve an optimization problem in one step. {F}urthermore,
	in this paper, we argue that it is feasible to extend the prediction
	accuracy by adding a second-stage multi-class {SVM} to capture the
	contextual information among secondary structural elements and thereby
	further improving the accuracies. {W}e demonstrate that two-stage
	{SVM}s perform better than single-stage {SVM} techniques for {PSS}
	prediction using two datasets and report a maximum accuracy of 79.5\%.},
  keywords = {biosvm}
}
@article{Nielsen1997Identification,
  author  = {Nielsen, H. and Engelbrecht, J. and Brunak, S. and von Heijne, G.},
  title   = {Identification of prokaryotic and eukaryotic signal peptides
	and prediction of their cleavage sites},
  journal = {Protein {E}ng.},
  volume  = {10},
  number  = {1},
  pages   = {1--6},
  year    = {1997},
  url     = {http://protein.oupjournals.org/cgi/content/abstract/10/1/1},
  subject = {bioprot},
  pdf     = {../local/niel97.pdf},
  file    = {niel97.pdf:local/niel97.pdf:PDF}
}
@incollection{Noble2004Support,
  author = {Noble, W. S.},
  title = {Support vector machine applications in computational biology},
  booktitle = {Kernel {M}ethods in {C}omputational {B}iology},
  publisher = {MIT Press},
  year = {2004},
  editor = {Sch{\"o}lkopf, B. and Tsuda, K. and Vert, J. P.},
  pages = {71--92},
  abstract = {During the past three years, the support vector machine learning algorithm
	has been extensively applied within the field of computational biology.
	{T}he algorithm has been used to detect patterns within and among
	biological sequences, to classify genes and patients based upon gene
	expression profiles, and has recently been applied to several new
	biological problems. {T}his chapter reviews the state of the art
	with respect to {SVM} applications in computational biology.},
  pdf = {../local/Noble2004Support.pdf},
  file = {Noble2004Support.pdf:local/Noble2004Support.pdf:PDF},
  keywords = {biosvm},
  owner = {vert}
}
@article{ODonnell2005Gene,
  author = {Rebekah K O'Donnell and Michael Kupferman and S. Jack Wei and Sunil
	Singhal and Randal Weber and Bert O'Malley and Yi Cheng and Mary
	Putt and Michael Feldman and Barry Ziober and Ruth J Muschel},
  title = {Gene expression signature predicts lymphatic metastasis in squamous
	cell carcinoma of the oral cavity.},
  journal = {Oncogene},
  year = {2005},
  volume = {24},
  pages = {1244--1251},
  number = {7},
  month = feb,
  abstract = {Metastasis via the lymphatics is a major risk factor in squamous cell
	carcinoma of the oral cavity ({OSCC}). {W}e sought to determine whether
	the presence of metastasis in the regional lymph node could be predicted
	by a gene expression signature of the primary tumor. {A} total of
	18 {OSCC}s were characterized for gene expression by hybridizing
	{RNA} to {A}ffymetrix {U}133{A} gene chips. {G}enes with differential
	expression were identified using a permutation technique and verified
	by quantitative {RT}-{PCR} and immunohistochemistry. {A} predictive
	rule was built using a support vector machine, and the accuracy of
	the rule was evaluated using crossvalidation on the original data
	set and prediction of an independent set of four patients. {M}etastatic
	primary tumors could be differentiated from nonmetastatic primary
	tumors by a signature gene set of 116 genes. {T}his signature gene
	set correctly predicted the four independent patients as well as
	associating five lymph node metastases from the original patient
	set with the metastatic primary tumor group. {W}e concluded that
	lymph node metastasis could be predicted by gene expression profiles
	of primary oral cavity squamous cell carcinomas. {T}he presence of
	a gene expression signature for lymph node metastasis indicates that
	clinical testing to assess risk for lymph node metastasis should
	be possible.},
  doi = {10.1038/sj.onc.1208285},
  pdf = {../local/O'Donnell2005Gene.pdf},
  file = {O'Donnell2005Gene.pdf:local/O'Donnell2005Gene.pdf:PDF},
  keywords = {biosvm microarray},
  pii = {1208285},
  url = {http://dx.doi.org/10.1038/sj.onc.1208285}
}
@article{OFlanagan2005Non,
  author = {R. A. O'Flanagan and G. Paillard and R. Lavery and A. M. Sengupta},
  title = {Non-additivity in protein-{DNA} binding.},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21},
  pages = {2254--2263},
  number = {10},
  month = may,
  abstract = {{MOTIVATION}: {L}ocalizing protein binding sites within genomic {DNA}
	is of considerable importance, but remains difficult for protein
	families, such as transcription factors, which have loosely defined
	target sequences. {I}t is generally assumed that protein affinity
	for {DNA} involves additive contributions from successive nucleotide
	pairs within the target sequence. {T}his is not necessarily true,
	and non-additive effects have already been experimentally demonstrated
	in a small number of cases. {T}he principal origin of non-additivity
	involves the so-called indirect component of protein-{DNA} recognition
	which is related to the sequence dependence of {DNA} deformation
	induced during complex formation. {N}on-additive effects are difficult
	to study because they require the identification of many more binding
	sequences than are normally necessary for describing additive specificity
	(typically via the construction of weight matrices). {RESULTS}: {I}n
	the present work we will use theoretically estimated binding energies
	as a basis for overcoming this problem. {O}ur approach enables us
	to study the full combinatorial set of sequences for a variety of
	{DNA}-binding proteins, make a detailed analysis of non-additive
	effects and exploit this information to improve binding site predictions
	using either weight matrices or support vector machines. {T}he results
	underline the fact that, even in the presence of significant deformation,
	non-additive effects may involve only a limited number of dinucleotide
	steps. {T}his information helps to reduce the number of binding sites
	which need to be identified for successful predictions and to avoid
	problems of over-fitting. {AVAILABILITY}: {T}he {SVM} software is
	available upon request from the authors.},
  doi = {10.1093/bioinformatics/bti361},
  pdf = {../local/OFlanagan2005Non.pdf},
  file = {OFlanagan2005Non.pdf:local/OFlanagan2005Non.pdf:PDF},
  keywords = {biosvm},
  pii = {bti361},
  url = {http://dx.doi.org/10.1093/bioinformatics/bti361}
}
@article{Oloff2006Chemometric,
  author = {Scott Oloff and Shuxing Zhang and Nagamani Sukumar and Curt Breneman
	and Alexander Tropsha},
  title = {Chemometric analysis of ligand receptor complementarity: identifying
	{Complementary} {Ligands} {Based} on {Receptor} {Information} ({CoLiBRI}).},
  journal = {J. Chem. Inf. Model.},
  year = {2006},
  volume = {46},
  pages = {844--851},
  number = {2},
  abstract = {We have developed a novel structure-based chemoinformatics approach
	to search for Complementary Ligands Based on Receptor Information
	(CoLiBRI). CoLiBRI is based on the representation of both receptor
	binding sites and their respective ligands in a space of universal
	chemical descriptors. The binding site atoms involved in the interaction
	with ligands are identified by the means of a computational geometry
	technique known as Delaunay tessellation as applied to X-ray characterized
	ligand-receptor complexes. TAE/RECON multiple chemical descriptors
	are calculated independently for each ligand as well as for its active
	site atoms. The representation of both ligands and active sites using
	chemical descriptors allows the application of well-known chemometric
	techniques in order to correlate chemical similarities between active
	sites and their respective ligands. We have established a protocol
	to map patterns of nearest neighbor active site vectors in a multidimensional
	TAE/RECON space onto those of their complementary ligands and vice
	versa. This protocol affords the prediction of a virtual complementary
	ligand vector in the ligand chemical space from the position of a
	known active site vector. This prediction is followed by chemical
	similarity calculations between this virtual ligand vector and those
	calculated for molecules in a chemical database to identify real
	compounds most similar to the virtual ligand. Consequently, the knowledge
	of the receptor active site structure affords straightforward and
	efficient identification of its complementary ligands in large databases
	of chemical compounds using rapid chemical similarity searches. Conversely,
	starting from the ligand chemical structure, one may identify possible
	complementary receptor cavities as well. We have applied the CoLiBRI
	approach to a data set of 800 X-ray characterized ligand-receptor
	complexes in the PDBbind database. Using a k nearest neighbor (kNN)
	pattern recognition approach and variable selection, we have shown
	that knowledge of the active site structure affords identification
	of its complementary ligand among the top 1\% of a large chemical
	database in over 90\% of all test active sites when a binding site
	of the same protein family was present in the training set. In the
	case where test receptors are highly dissimilar and not present among
	the receptor families in the training set, the prediction accuracy
	is decreased; however, CoLiBRI was still able to quickly eliminate
	75\% of the chemical database as improbable ligands. CoLiBRI affords
	rapid prefiltering of a large chemical database to eliminate compounds
	that have little chance of binding to a receptor active site.},
  doi = {10.1021/ci050065r},
  keywords = {Algorithms; Binding Sites; Binding, Competitive; Computational Biology;
	Databases, Factual; Drug Design; Drug Evaluation, Preclinical; Ligands;
	Models, Biological; Structure-Activity Relationship},
  owner = {laurent},
  pmid = {16563016},
  timestamp = {2007.09.22},
  url = {http://dx.doi.org/10.1021/ci050065r}
}
@article{Opper2001Universal,
  author = {M. Opper and R. Urbanczik},
  title = {Universal learning curves of support vector machines.},
  journal = {Phys {R}ev {L}ett},
  year = {2001},
  volume = {86},
  pages = {4410--4413},
  number = {19},
  month = may,
  abstract = {Using methods of statistical physics, we investigate the role of model
	complexity in learning with support vector machines ({SVM}s), which
	are an important alternative to neural networks. {W}e show the advantages
	of using {SVM}s with kernels of infinite complexity on noisy target
	rules, which, in contrast to common theoretical beliefs, are found
	to achieve optimal generalization error although the training error
	does not converge to the generalization error. {M}oreover, we find
	a universal asymptotics of the learning curves which depend only
	on the target rule but not on the {SVM} kernel.},
  keywords = {Algorithms, Amino Acid Sequence, Artificial Intelligence, Biological,
	Cell Compartmentation, Chemistry, Comparative Study, Computational
	Biology, Computer Simulation, Computer-Assisted, Databases, Decision
	Trees, Diagnosis, Discriminant Analysis, Electrophysiology, Factual,
	Gastric Emptying, Humans, Logistic Models, Melanoma, Models, Neural
	Networks (Computer), Nevus, Non-U.S. Gov't, Organelles, P.H.S., Physical,
	Pigmented, Predictive Value of Tests, Proteins, Proteome, Reproducibility
	of Results, Research Support, Skin Diseases, Skin Neoplasms, Skin
	Pigmentation, Software, Stomach Diseases, U.S. Gov't, 11328187}
}
@article{Opper2000Gaussian,
  author = {M. Opper and O. Winther},
  title = {Gaussian processes for classification: mean-field algorithms.},
  journal = {Neural {C}omput},
  year = {2000},
  volume = {12},
  pages = {2655--2684},
  number = {11},
  month = nov,
  abstract = {We derive a mean-field algorithm for binary classification with gaussian
	processes that is based on the {TAP} approach originally proposed
	in statistical physics of disordered systems. {T}he theory also yields
	an approximate leave-one-out estimator for the generalization error,
	which is computed with no extra computational cost. {W}e show that
	from the {TAP} approach, it is possible to derive both a simpler
	"naive" mean-field theory and support vector machines ({SVM}s) as
	limiting cases. {F}or both mean-field algorithms and support vector
	machines, simulation results for three small benchmark data sets
	are presented. {T}hey show that one may get state-of-the-art performance
	by using the leave-one-out estimator for model selection and the
	built-in leave-one-out estimators are extremely precise when compared
	to the exact leave-one-out estimate. {T}he second result is taken
	as strong support for the internal consistency of the mean-field
	approach.},
  keywords = {Acute, Acute Disease, Adenocarcinoma, Algorithms, Amino Acid Sequence,
	Animals, Artificial Intelligence, Automated, B-Lymphocytes, Bacterial
	Proteins, Base Pair Mismatch, Base Sequence, Bayes Theorem, Binding
	Sites, Biological, Bone Marrow Cells, Brachyura, Cell Compartmentation,
	Chemistry, Child, Chromosome Aberrations, Classification, Colonic
	Neoplasms, Comparative Study, Computational Biology, Computer Simulation,
	Computer-Assisted, DNA, Data Interpretation, Databases, Decision
	Trees, Diabetes Mellitus, Diagnosis, Discriminant Analysis, Discrimination
	Learning, Electric Conductivity, Electrophysiology, Escherichia coli
	Proteins, Factual, Feedback, Female, Fungal, Gastric Emptying, Gene
	Expression Profiling, Gene Expression Regulation, Genes, Genetic,
	Genetic Markers, Genetic Predisposition to Disease, Hemolysins, Humans,
	Indians, Ion Channels, Kinetics, Leukemia, Likelihood Functions,
	Lipid Bilayers, Logistic Models, Lymphocytic, Male, Markov Chains,
	Melanoma, Models, Molecular, Myeloid, Neoplasm, Neoplasms, Neoplastic,
	Neural Networks (Computer), Neurological, Nevus, Non-P.H.S., Non-U.S.
	Gov't, Nonlinear Dynamics, Normal Distribution, North American, Nucleic
	Acid Conformation, Oligonucleotide Array Sequence Analysis, Organ
	Specificity, Organelles, Ovarian Neoplasms, Ovary, P.H.S., Pattern
	Recognition, Physical, Pigmented, Predictive Value of Tests, Promoter
	Regions (Genetics), Protein Folding, Protein Structure, Proteins,
	Proteome, RNA, Reproducibility of Results, Research Support, Saccharomyces
	cerevisiae, Secondary, Sensitivity and Specificity, Sequence Alignment,
	Sex Characteristics, Skin Diseases, Skin Neoplasms, Skin Pigmentation,
	Software, Sound Spectrography, Statistical, Stomach Diseases, T-Lymphocytes,
	Thermodynamics, Transcription, Transcription Factors, Tumor Markers,
	Type 2, U.S. Gov't, 11110131}
}
@article{Pahikkala2005Contextual,
  author = {Tapio Pahikkala and Filip Ginter and Jorma Boberg and Jouni Jarvinen
	and Tapio Salakoski},
  title = {Contextual weighting for {Support} {Vector} {Machines} in literature
	mining: an application to gene versus protein name disambiguation.},
  journal = {{BMC} {B}ioinformatics},
  year = {2005},
  volume = {6},
  pages = {157},
  number = {1},
  month = jun,
  abstract = {{BACKGROUND}: {T}he ability to distinguish between genes and proteins
	is essential for understanding biological text. {S}upport {V}ector
	{M}achines ({SVM}s) have been proven to be very efficient in general
	data mining tasks. {W}e explore their capability for the gene versus
	protein name disambiguation task. {RESULTS}: {W}e incorporated into
	the conventional {SVM} a weighting scheme based on distances of context
	words from the word to be disambiguated. {T}his weighting scheme
	increased the performance of {SVM}s by five percentage points giving
	performance better than 85\% as measured by the area under {ROC}
	curve and outperformed the {W}eighted {A}dditive {C}lassifier, which
	also incorporates the weighting, and the {N}aive {B}ayes classifier.
	{CONCLUSIONS}: {W}e show that the performance of {SVM}s can be improved
	by the proposed weighting scheme. {F}urthermore, our results suggest
	that in this study the increase of the classification performance
	due to the weighting is greater than that obtained by selecting the
	underlying classifier or the kernel part of the {SVM}.},
  doi = {10.1186/1471-2105-6-157},
  pdf = {../local/Pahikkala2005Contextual.pdf},
  file = {Pahikkala2005Contextual.pdf:local/Pahikkala2005Contextual.pdf:PDF},
  keywords = {biosvm},
  pii = {1471-2105-6-157},
  url = {http://dx.doi.org/10.1186/1471-2105-6-157}
}
@article{Paik2006Gene,
  author = {Paik, Soonmyung and Tang, Gong and Shak, Steven and Kim, Chungyeul
	and Baker, Joffre and Kim, Wanseop and Cronin, Maureen and Baehner,
	Frederick L. and Watson, Drew and Bryant, John and Costantino, Joseph
	P. and Geyer, Jr, Charles E and Wickerham, D Lawrence and Wolmark,
	Norman},
  title = {Gene expression and benefit of chemotherapy in women with node-negative,
	estrogen receptor-positive breast cancer.},
  journal = {J Clin Oncol},
  year = {2006},
  volume = {24},
  pages = {3726--3734},
  number = {23},
  month = aug,
  abstract = {The 21-gene recurrence score (RS) assay quantifies the likelihood
	of distant recurrence in women with estrogen receptor-positive, lymph
	node-negative breast cancer treated with adjuvant tamoxifen. The
	relationship between the RS and chemotherapy benefit is not known. The
	RS was measured in tumors from the tamoxifen-treated and tamoxifen
	plus chemotherapy-treated patients in the National Surgical Adjuvant
	Breast and Bowel Project (NSABP) B20 trial. Cox proportional hazards
	models were utilized to test for interaction between chemotherapy
	treatment and the RS. A total of 651 patients were assessable (227
	randomly assigned to tamoxifen and 424 randomly assigned to tamoxifen
	plus chemotherapy). The test for interaction between chemotherapy
	treatment and RS was statistically significant (P = .038). Patients
	with high-RS (> or = 31) tumors (ie, high risk of recurrence) had
	a large benefit from chemotherapy (relative risk, 0.26; 95\% CI,
	0.13 to 0.53; absolute decrease in 10-year distant recurrence rate:
	mean, 27.6\%; SE, 8.0\%). Patients with low-RS (< 18) tumors derived
	minimal, if any, benefit from chemotherapy treatment (relative risk,
	1.31; 95\% CI, 0.46 to 3.78; absolute decrease in distant recurrence
	rate at 10 years: mean, -1.1\%; SE, 2.2\%). Patients with intermediate-RS
	tumors did not appear to have a large benefit, but the uncertainty
	in the estimate can not exclude a clinically important benefit. The
	RS assay not only quantifies the likelihood of breast cancer recurrence
	in women with node-negative, estrogen receptor-positive breast cancer,
	but also predicts the magnitude of chemotherapy benefit.},
  doi = {10.1200/JCO.2005.04.7985},
  institution = {Division of Pathology, Operations Center, and Biostatistical Center,
	National Surgical Adjuvant Breast and Bowel Project, Pittsburgh,
	PA 15212, USA. soon.paik@nsabp.org},
  keywords = {Adult; Aged; Antineoplastic Combined Chemotherapy Protocols, administration
	/&/ dosage/therapeutic use; Breast Neoplasms, drug therapy/metabolism/pathology/prevention
	/&/ control; Cisplatin, administration /&/ dosage; Female; Fluorouracil,
	administration /&/ dosage; Gene Expression Regulation, Neoplastic;
	Humans; Linear Models; Lymphatic Metastasis; Methotrexate, administration
	/&/ dosage; Middle Aged; Mitomycins, administration /&/ dosage; Neoplasm
	Proteins, metabolism; Neoplasm Recurrence, Local, metabolism/prevention
	/&/ control; Odds Ratio; Predictive Value of Tests; Prognosis; Proportional
	Hazards Models; Randomized Controlled Trials as Topic; Receptors,
	Estrogen, metabolism; Recurrence, prevention /&/ control; Reverse
	Transcriptase Polymerase Chain Reaction; Risk Assessment; Risk Factors;
	Tamoxifen, administration /&/ dosage; Tumor Markers, Biological,
	metabolism},
  language = {eng},
  medline-pst = {ppublish},
  owner = {jp},
  pii = {JCO.2005.04.7985},
  pmid = {16720680},
  timestamp = {2012.03.09},
  url = {http://dx.doi.org/10.1200/JCO.2005.04.7985}
}
@article{Pan2004Comprehensive,
  author = {Fei Pan and Baoying Wang and Xin Hu and William Perrizo},
  title = {Comprehensive vertical sample-based {KNN}/{LSVM} classification for
	gene expression analysis.},
  journal = {J {B}iomed {I}nform},
  year = {2004},
  volume = {37},
  pages = {240--248},
  number = {4},
  month = aug,
  abstract = {Classification analysis of microarray gene expression data has been
	widely used to uncover biological features and to distinguish closely
	related cell types that often appear in the diagnosis of cancer.
	{H}owever, the number of dimensions of gene expression data is often
	very high, e.g., in the hundreds or thousands. {A}ccurate and efficient
	classification of such high-dimensional data remains a contemporary
	challenge. {I}n this paper, we propose a comprehensive vertical sample-based
	{KNN}/{LSVM} classification approach with weights optimized by genetic
	algorithms for high-dimensional data. {E}xperiments on common gene
	expression datasets demonstrated that our approach can achieve high
	accuracy and efficiency at the same time. {T}he improvement of speed
	is mainly related to the vertical data representation, {P}-tree
	({P}atents are pending on the {P}-tree technology. {T}his work is
	partially supported by {GSA} {G}rant {ACT}\#:{K}96130308.), and its
	optimized logical algebra. {T}he high accuracy is due to the combination
	of a {KNN} majority voting approach and a local support vector machine
	approach that makes optimal decisions at the local level. {A}s a
	result, our approach could be a powerful tool for high-dimensional
	gene expression data analysis.},
  doi = {10.1016/j.jbi.2004.07.003},
  pdf = {../local/Pan2004Comprehensive.pdf},
  file = {Pan2004Comprehensive.pdf:local/Pan2004Comprehensive.pdf:PDF},
  keywords = {biosvm},
  pii = {S1532-0464(04)00070-X},
  url = {http://dx.doi.org/10.1016/j.jbi.2004.07.003}
}
@article{Pandey2000Proteomics,
  author = {Pandey, A. and Mann, M.},
  title = {Proteomics to study genes and genomes},
  journal = {Nature},
  year = {2000},
  volume = {405},
  pages = {837--846},
  number = {6788},
  pdf = {../local/pand00.pdf},
  file = {pand00.pdf:local/pand00.pdf:PDF},
  subject = {bioprot},
  url = {http://www.nature.com/cgi-taf/DynaPage.taf?file=/nature/journal/v405/n6788/full/405837a0_fs.html&content_filetype=pdf}
}
@article{Park2003Prediction,
  author = {Park, K.-J. and Kanehisa, M.},
  title = {Prediction of protein subcellular locations by support vector machines
	using compositions of amino acids and amino acid pairs},
  journal = {Bioinformatics},
  year = {2003},
  volume = {19},
  pages = {1656--1663},
  number = {13},
  abstract = {Motivation: {T}he subcellular location of a protein is closely correlated
	to its function. {T}hus, computational prediction of subcellular
	locations from the amino acid sequence information would help annotation
	and functional prediction of protein coding genes in complete genomes.
	{W}e have developed a method based on support vector machines ({SVM}s).
	{R}esults: {W}e considered 12 subcellular locations in eukaryotic
	cells: chloroplast, cytoplasm, cytoskeleton, endoplasmic reticulum,
	extracellular medium, {G}olgi apparatus, lysosome, mitochondrion,
	nucleus, peroxisome, plasma membrane, and vacuole. {W}e constructed
	a data set of proteins with known locations from the {SWISS}-{PROT}
	database. {A} set of {SVM}s was trained to predict the subcellular
	location of a given protein based on its amino acid, amino acid pair,
	and gapped amino acid pair compositions. {T}he predictors based on
	these different compositions were then combined using a voting scheme.
	{R}esults obtained through 5-fold cross-validation tests showed an
	improvement in prediction accuracy over the algorithm based on the
	amino acid composition only. {T}his prediction method is available
	via the {I}nternet. {A}vailability: http://www.genome.ad.jp/{SIT}/ploc.html
	{S}upplementary information: http://web.kuicr.kyoto-u.ac.jp/\~{}park/{S}eqdata/},
  pdf = {../local/Park2003Prediction.pdf},
  file = {Park2003Prediction.pdf:local/Park2003Prediction.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/13/1656}
}
@article{Park2009ChIP,
  author = {Peter J Park},
  title = {{ChIP-seq}: advantages and challenges of a maturing technology.},
  journal = {Nat Rev Genet},
  year = {2009},
  volume = {10},
  pages = {669--680},
  number = {10},
  month = oct,
  abstract = {Chromatin immunoprecipitation followed by sequencing (ChIP-seq) is
	a technique for genome-wide profiling of DNA-binding proteins, histone
	modifications or nucleosomes. Owing to the tremendous progress in
	next-generation sequencing technology, ChIP-seq offers higher resolution,
	less noise and greater coverage than its array-based predecessor
	ChIP-chip. With the decreasing cost of sequencing, ChIP-seq has become
	an indispensable tool for studying gene regulation and epigenetic
	mechanisms. In this Review, I describe the benefits and challenges
	in harnessing this technique with an emphasis on issues related to
	experimental design and data analysis. ChIP-seq experiments generate
	large quantities of data, and effective computational analysis will
	be crucial for uncovering biological mechanisms.},
  doi = {10.1038/nrg2641},
  institution = {Harvard Medical School, 10 Shattuck Street, Boston, MA 02115, USA.
	peter\_park@harvard.edu},
  keywords = {Animals; Chromatin Immunoprecipitation, methods; Computational Biology;
	DNA-Binding Proteins, genetics; Epigenesis, Genetic; Humans; Nucleosomes,
	genetics; Sequence Analysis, DNA, methods},
  language = {eng},
  medline-pst = {ppublish},
  owner = {philippe},
  pii = {nrg2641},
  pmid = {19736561},
  timestamp = {2010.08.05},
  url = {http://dx.doi.org/10.1038/nrg2641}
}
@article{Passerini2004Learning,
  author = {Passerini, A. and Frasconi, P.},
  title = {Learning to discriminate between ligand-bound and disulfide-bound
	cysteines},
  journal = {Protein {E}ng. {D}es. {S}el.},
  year = {2004},
  volume = {17},
  pages = {367--373},
  number = {4},
  abstract = {We present a machine learning method to discriminate between cysteines
	involved in ligand binding and cysteines forming disulfide bridges.
	{O}ur method uses a window of multiple alignment profiles to represent
	each instance and support vector machines with a polynomial kernel
	as the learning algorithm. {W}e also report results obtained with
	two new kernel functions based on similarity matrices. {E}xperimental
	results indicate that binding type can be predicted at significantly
	higher accuracy than using {PROSITE} patterns.},
  doi = {10.1093/protein/gzh042},
  pdf = {../local/Passerini2004Learning.pdf},
  file = {Passerini2004Learning.pdf:local/Passerini2004Learning.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1093/protein/gzh042}
}
@techreport{Pastor-Satorras2002Evolving,
  author = {Pastor-Satorras, R. and Smith, E. D. and Sol{\'e}, R. V.},
  title = {Evolving protein interaction networks through gene duplication},
  institution = {Santa Fe Institute},
  year = {2002},
  type = {Working paper},
  number = {02-02-008},
  pdf = {../local/past02.pdf},
  file = {past02.pdf:local/past02.pdf:PDF},
  subject = {bionetprot},
  url = {http://www.santafe.edu/sfi/publications/Abstracts/02-02-008abs.html}
}
@inproceedings{Patterson2002Pre-mRNA,
  author = {Patterson, D. J. and Yasuhara, K. and Ruzzo, W. L.},
  title = {Pre-{m{RNA}} secondary structure prediction aids splice site prediction.},
  booktitle = {Proceedings of the {P}acific {S}ymposium on {B}iocomputing 2002},
  year = {2002},
  editor = {Russ B. Altman and A. Keith Dunker and Lawrence Hunter and Kevin
	Lauderdale and Teri E. Klein},
  pages = {223--234},
  publisher = {World Scientific},
  abstract = {Accurate splice site prediction is a critical component of any computational
	approach to gene prediction in higher organisms. {E}xisting approaches
	generally use sequence-based models that capture local dependencies
	among nucleotides in a small window around the splice site. {W}e
	present evidence that computationally predicted secondary structure
	of moderate length pre-m{RNA} subsequences contains information
	that can be exploited to improve acceptor splice site prediction
	beyond that possible with conventional sequence-based approaches.
	{B}oth decision tree and support vector machine classifiers, using
	folding energy and structure metrics characterizing helix formation
	near the splice site, achieve a 5-10\% reduction in error rate with
	a human data set. {B}ased on our data, we hypothesize that acceptors
	preferentially exhibit short helices at the splice site.},
  pdf = {../local/Patterson2002Pre-mRNA.pdf},
  file = {Patterson2002Pre-mRNA.pdf:local/Patterson2002Pre-mRNA.pdf:PDF},
  keywords = {biosvm},
  subject = {biokernel},
  url = {http://www.smi.stanford.edu/projects/helix/psb02/patterson.pdf}
}
@article{Patterson2003Proteomics,
  author = {Scott D Patterson and Ruedi H Aebersold},
  title = {Proteomics: the first decade and beyond.},
  journal = {Nat Genet},
  year = {2003},
  volume = {33 Suppl},
  pages = {311--323},
  month = mar,
  abstract = {Proteomics is the systematic study of the many and diverse properties
	of proteins in a parallel manner with the aim of providing detailed
	descriptions of the structure, function and control of biological
	systems in health and disease. Advances in methods and technologies
	have catalyzed an expansion of the scope of biological studies from
	the reductionist biochemical analysis of single proteins to proteome-wide
	measurements. Proteomics and other complementary analysis methods
	are essential components of the emerging 'systems biology' approach
	that seeks to comprehensively describe biological systems through
	integration of diverse types of data and, in the future, to ultimately
	allow computational simulations of complex biological systems.},
  doi = {10.1038/ng1106},
  institution = {Celera Genomics Corporation, 45 West Gude Drive, Rockville, Maryland
	20850, USA. scottp@farmalbiomed.com},
  keywords = {Amino Acid Sequence; Base Sequence; Chromatography, Liquid; Computational
	Biology; DNA; Genetic Techniques; History, 20th Century; History,
	21st Century; Mass Spectrometry; Oligonucleotide Array Sequence Analysis;
	Proteins; Proteomics},
  owner = {phupe},
  pii = {ng1106},
  pmid = {12610541},
  timestamp = {2010.08.13},
  url = {http://dx.doi.org/10.1038/ng1106}
}
@article{Pavey2004Microarray,
  author = {Pavey, S. and Johansson, P. and Packer, L. and Taylor, J. and Stark,
	M. and Pollock, P. M. and Walker, G. J. and Boyle, G. M. and Harper,
	U. and Cozzi, S. J. and Hansen, K. and Yudt, L. and Schmidt, C. and
	Hersey, P. and Ellem, K. A. and O'Rourke, M. G. and Parsons, P. G. and
	Meltzer, P. and Ringner, M. and Hayward, N. K.},
  title = {Microarray expression profiling in melanoma reveals a {BRAF} mutation
	signature},
  journal = {Oncogene},
  year = {2004},
  volume = {23},
  pages = {4060--4067},
  number = {23},
  month = may,
  abstract = {We have used microarray gene expression profiling and machine learning
	to predict the presence of {BRAF} mutations in a panel of 61 melanoma
	cell lines. {T}he {BRAF} gene was found to be mutated in 42 samples
	(69\%) and intragenic mutations of the {NRAS} gene were detected in
	seven samples (11\%). {N}o cell line carried mutations of both genes.
	{U}sing support vector machines, we have built a classifier that
	differentiates between melanoma cell lines based on {BRAF} mutation
	status. {A}s few as 83 genes are able to discriminate between {BRAF}
	mutant and {BRAF} wild-type samples with clear separation observed
	using hierarchical clustering. {M}ultidimensional scaling was used
	to visualize the relationship between a {BRAF} mutation signature
	and that of a generalized mitogen-activated protein kinase ({MAPK})
	activation (either {BRAF} or {NRAS} mutation) in the context of the
	discriminating gene list. {W}e observed that samples carrying {NRAS}
	mutations lie somewhere between those with or without {BRAF} mutations.
	{T}hese observations suggest that there are gene-specific mutation
	signals in addition to a common {MAPK} activation that result from
	the pleiotropic effects of either {BRAF} or {NRAS} on other signaling
	pathways, leading to measurably different transcriptional changes.},
  doi = {10.1038/sj.onc.1207563},
  pdf = {../local/Pavey2004Microarray.pdf},
  file = {Pavey2004Microarray.pdf:local/Pavey2004Microarray.pdf:PDF},
  keywords = {biosvm microarray},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1038/sj.onc.1207563}
}
@inproceedings{Pavlidis2001Promoter,
  author = {Pavlidis, P. and Furey, T. S. and Liberto, M. and Haussler, D. and
	Grundy, W. N.},
  title = {Promoter {Region-Based} {Classification} of {Genes}},
  booktitle = {Pacific {Symposium} on {Biocomputing}},
  year = {2001},
  pages = {139--150},
  pdf = {../local/pavl01b.pdf},
  file = {pavl01b.pdf:local/pavl01b.pdf:PDF},
  keywords = {biosvm},
  subject = {biokernel},
  url = {http://www.smi.stanford.edu/projects/helix/psb01/pavlidis.pdf}
}
@article{Pavlidis2004Support,
  author = {Paul Pavlidis and Ilan Wapinski and William Stafford Noble},
  title = {Support vector machine classification on the web.},
  journal = {Bioinformatics},
  year = {2004},
  volume = {20},
  pages = {586--587},
  number = {4},
  month = mar,
  abstract = {The support vector machine ({SVM}) learning algorithm has been widely
	applied in bioinformatics. {W}e have developed a simple web interface
	to our implementation of the {SVM} algorithm, called {G}ist. {T}his
	interface allows novice or occasional users to apply a sophisticated
	machine learning algorithm easily to their data. {M}ore advanced
	users can download the software and source code for local installation.
	{T}he availability of these tools will permit more widespread application
	of this powerful learning algorithm in bioinformatics.},
  doi = {10.1093/bioinformatics/btg461},
  pdf = {../local/Pavlidis2004Support.pdf},
  file = {Pavlidis2004Support.pdf:local/Pavlidis2004Support.pdf:PDF},
  keywords = {Adaptation, Algorithms, Ambergris, Amino Acid Sequence, Animals, Artifacts,
	Artificial Intelligence, Automated, Cadmium, Candida, Candida albicans,
	Capillary, Clinical, Cluster Analysis, Combinatorial Chemistry Techniques,
	Comparative Study, Computational Biology, Computer Simulation, Computer-Assisted,
	Computing Methodologies, Databases, Decision Support Systems, Electrophoresis,
	Enzymes, Europe, Eye Enucleation, Humans, Image Interpretation, Image
	Processing, Information Storage and Retrieval, Internet, Magnetic
	Resonance Imaging, Magnetic Resonance Spectroscopy, Markov Chains,
	Melanoma, Models, Molecular, Molecular Conformation, Molecular Sequence
	Data, Molecular Structure, Neural Networks (Computer), Non-P.H.S.,
	Non-U.S. Gov't, Nonlinear Dynamics, Odors, P.H.S., Pattern Recognition,
	Perfume, Physiological, Predictive Value of Tests, Prognosis, Prospective
	Studies, Protein, Protein Structure, Proteins, Proteomics, Quantitative
	Structure-Activity Relationship, Rats, Reproducibility of Results,
	Research Support, Saccharomyces cerevisiae, Saccharomyces cerevisiae
	Proteins, Secondary, Sensitivity and Specificity, Signal Processing,
	Single-Blind Method, Soft Tissue Neoplasms, Software, Statistical,
	U.S. Gov't, Uveal Neoplasms, Visual},
  pii = {btg461},
  pmid = {14990457},
  url = {http://dx.doi.org/10.1093/bioinformatics/btg461}
}
@inproceedings{Pavlidis2001Gene,
  author = {Pavlidis, P. and Weston, J. and Cai, J. and Grundy, W. N.},
  title = {Gene functional classification from heterogeneous data},
  booktitle = {Proceedings of the {F}ifth {A}nnual {I}nternational {C}onference
	on {C}omputational {B}iology},
  year = {2001},
  pages = {249--255},
  pdf = {../local/pavl01.pdf},
  file = {pavl01.pdf:local/pavl01.pdf:PDF},
  keywords = {biosvm},
  subject = {biokernel},
  url = {http://www.cs.columbia.edu/compbio/papers/exp-phylo.pdf}
}
@article{Pavlidis2002Learning,
  author = {Pavlidis, P. and Weston, J. and Cai, J. and Noble, W. S.},
  title = {Learning Gene Functional Classifications from Multiple Data Types},
  journal = {J. Comput. Biol.},
  year = {2002},
  volume = {9},
  pages = {401--411},
  number = {2},
  abstract = {In our attempts to understand cellular function at the molecular level,
	we must be able to synthesize information from disparate types of
	genomic data. {W}e consider the problem of inferring gene functional
	classifications from a heterogeneous data set consisting of {DNA}
	microarray expression measurements and phylogenetic profiles from
	whole-genome sequence comparisons. {W}e demonstrate the application
	of the support vector machine ({SVM}) learning algorithm to this
	functional inference task. {O}ur results suggest the importance of
	exploiting prior information about the heterogeneity of the data.
	{I}n particular, we propose an {SVM} kernel function that is explicitly
	heterogeneous. {I}n addition, we describe feature scaling methods
	for further exploiting prior knowledge of heterogeneity by giving
	each data type different weights.},
  doi = {10.1089/10665270252935539},
  pdf = {../local/Pavlidis2002Learning.pdf},
  file = {Pavlidis2002Learning.pdf:local/Pavlidis2002Learning.pdf:PDF},
  keywords = {biosvm},
  owner = {vert}
}
@article{Pellegrini1999Assigning,
  author = {Pellegrini, M. and Marcotte, E. M. and Thompson, M. J. and Eisenberg,
	D. and Yeates, T. O.},
  title = {Assigning protein functions by comparative genome analysis: {P}rotein
	phylogenetic profiles},
  journal = {Proc. {N}atl. {A}cad. {S}ci. {USA}},
  year = {1999},
  volume = {96},
  pages = {4285--4288},
  number = {8},
  month = apr,
  pdf = {../local/pell99.pdf},
  file = {pell99.pdf:local/pell99.pdf:PDF},
  subject = {bio},
  url = {http://www.pnas.org/cgi/reprint/96/8/4285.pdf}
}
@article{Peng2003Molecular,
  author = {Peng, S. and Xu, Q. and Ling, X. B. and Peng, X. and Du, W. and Chen,
	L.},
  title = {Molecular classification of cancer types from microarray data using
	the combination of genetic algorithms and support vector machines.},
  journal = {{FEBS} Lett.},
  year = {2003},
  volume = {555},
  pages = {358--362},
  number = {2},
  abstract = {Simultaneous multiclass classification of tumor types is essential
	for future clinical implementations of microarray-based cancer diagnosis.
	{I}n this study, we have combined genetic algorithms ({GA}s) and
	all paired support vector machines ({SVM}s) for multiclass cancer
	identification. {T}he predictive features have been selected through
	iterative {SVM}s/{GA}s, and recursive feature elimination post-processing
	steps, leading to a very compact cancer-related predictive gene set.
	{L}eave-one-out cross-validations yielded accuracies of 87.93\% for
	the eight-class and 85.19\% for the fourteen-class cancer classifications,
	outperforming the results derived from previously published methods.},
  doi = {10.1016/S0014-5793(03)01275-4},
  pdf = {../local/Peng2003Molecular.pdf},
  file = {Peng2003Molecular.pdf:local/Peng2003Molecular.pdf:PDF},
  keywords = {biosvm microarray},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1016/S0014-5793(03)01275-4}
}
@article{Peters2005Generating,
  author = {Bjoern Peters and Alessandro Sette},
  title = {Generating quantitative models describing the sequence specificity
	of biological processes with the stabilized matrix method.},
  journal = {BMC Bioinformatics},
  year = {2005},
  volume = {6},
  pages = {132},
  abstract = {BACKGROUND: Many processes in molecular biology involve the recognition
	of short sequences of nucleic- or amino acids, such as the binding
	of immunogenic peptides to major histocompatibility complex (MHC)
	molecules. From experimental data, a model of the sequence specificity
	of these processes can be constructed, such as a sequence motif,
	a scoring matrix or an artificial neural network. The purpose of
	these models is two-fold. First, they can provide a summary of experimental
	results, allowing for a deeper understanding of the mechanisms involved
	in sequence recognition. Second, such models can be used to predict
	the experimental outcome for yet untested sequences. In the past
	we reported the development of a method to generate such models called
	the Stabilized Matrix Method (SMM). This method has been successfully
	applied to predicting peptide binding to MHC molecules, peptide transport
	by the transporter associated with antigen presentation (TAP) and
	proteasomal cleavage of protein sequences. RESULTS: Herein we report
	the implementation of the SMM algorithm as a publicly available software
	package. Specific features determining the type of problems the method
	is most appropriate for are discussed. Advantageous features of the
	package are: (1) the output generated is easy to interpret, (2) input
	and output are both quantitative, (3) specific computational strategies
	to handle experimental noise are built in, (4) the algorithm is designed
	to effectively handle bounded experimental data, (5) experimental
	data from randomized peptide libraries and conventional peptides
	can easily be combined, and (6) it is possible to incorporate pair
	interactions between positions of a sequence. CONCLUSION: Making
	the SMM method publicly available enables bioinformaticians and experimental
	biologists to easily access it, to compare its performance to other
	prediction methods, and to extend it to other applications.},
  doi = {10.1186/1471-2105-6-132},
  keywords = {Algorithms; Amino Acid Sequence; Biology; Computational Biology; Computer
	Simulation; Data Interpretation, Statistical; Databases, Protein;
	Models, Biological; Models, Statistical; Neural Networks (Computer);
	Peptide Library; Peptides; Programming Languages; Protein Binding;
	Sensitivity and Specificity; Software},
  owner = {laurent},
  pii = {1471-2105-6-132},
  pmid = {15927070},
  timestamp = {2007.07.12},
  url = {http://dx.doi.org/10.1186/1471-2105-6-132}
}
@article{Pham2005Support,
  author = {Tho Hoan Pham and Kenji Satou and Tu Bao Ho},
  title = {Support vector machines for prediction and analysis of beta and gamma-turns
	in proteins.},
  journal = {J. {B}ioinform. {C}omput. {B}iol.},
  year = {2005},
  volume = {3},
  pages = {343--358},
  number = {2},
  month = apr,
  abstract = {Tight turns have long been recognized as one of the three important
	features of proteins, together with alpha-helix and beta-sheet. {T}ight
	turns play an important role in globular proteins from both the structural
	and functional points of view. {M}ore than 90\% tight turns are beta-turns
	and most of the rest are gamma-turns. {A}nalysis and prediction of
	beta-turns and gamma-turns is very useful for design of new molecules
	such as drugs, pesticides, and antigens. {I}n this paper we investigated
	two aspects of applying support vector machine ({SVM}), a promising
	machine learning method for bioinformatics, to prediction and analysis
	of beta-turns and gamma-turns. {F}irst, we developed two {SVM}-based
	methods, called {BTSVM} and {GTSVM}, which predict beta-turns and
	gamma-turns in a protein from its sequence. {W}hen compared with
	other methods, {BTSVM} has a superior performance and {GTSVM} is
	competitive. {S}econd, we used {SVM}s with a linear kernel to estimate
	the support of amino acids for the formation of beta-turns and gamma-turns
	depending on their position in a protein. {O}ur analysis results
	are more comprehensive and easier to use than the previous results
	in designing turns in proteins.},
  keywords = {biosvm},
  pii = {S0219720005001089}
}
@article{Pham2003Prediction,
  author = {Tho Hoan Pham and Kenji Satou and Tu Bao Ho},
  title = {Prediction and analysis of beta-turns in proteins by support vector
	machine.},
  journal = {Genome {I}nform {S}er {W}orkshop {G}enome {I}nform},
  year = {2003},
  volume = {14},
  pages = {196--205},
  abstract = {Tight turn has long been recognized as one of the three important
	features of proteins after the alpha-helix and beta-sheet. {T}ight
	turns play an important role in globular proteins from both the structural
	and functional points of view. {M}ore than 90\% tight turns are beta-turns.
	{A}nalysis and prediction of beta-turns in particular and tight turns
	in general are very useful for the design of new molecules such as
	drugs, pesticides, and antigens. {I}n this paper, we introduce a
	support vector machine ({SVM}) approach to prediction and analysis
	of beta-turns. {W}e have investigated two aspects of applying {SVM}
	to the prediction and analysis of beta-turns. {F}irst, we developed
	a new {SVM} method, called {BTSVM}, which predicts beta-turns of
	a protein from its sequence. {T}he prediction results on the dataset
	of 426 non-homologous protein chains by sevenfold cross-validation
	technique showed that our method is superior to the other previous
	methods. {S}econd, we analyzed how amino acid positions support (or
	prevent) the formation of beta-turns based on the ``multivariable''
	classification model of a linear {SVM}. {T}his model is more general
	than the other ones of previous statistical methods. {O}ur analysis
	results are more comprehensive and easier to use than previously
	published analysis results.},
  keywords = {biosvm}
}
@article{Plewczyski2005support,
  author = {Dariusz Plewczynski and Adrian Tkacz and Adam Godzik and Leszek Rychlewski},
  title = {A support vector machine approach to the identification of phosphorylation
	sites.},
  journal = {Cell {M}ol {B}iol {L}ett},
  year = {2005},
  volume = {10},
  pages = {73--89},
  number = {1},
  abstract = {We describe a bioinformatics tool that can be used to predict the
	position of phosphorylation sites in proteins based only on sequence
	information. {T}he method uses the support vector machine ({SVM})
	statistical learning theory. {T}he statistical models for phosphorylation
	by various types of kinases are built using a dataset of short (9-amino
	acid long) sequence fragments. {T}he sequence segments are dissected
	around post-translationally modified sites of proteins that are on
	the current release of the {S}wiss-{P}rot database, and that were
	experimentally confirmed to be phosphorylated by any kinase. {W}e
	represent them as vectors in a multidimensional abstract space of
	short sequence fragments. {T}he prediction method is as follows.
	{F}irst, a given query protein sequence is dissected into overlapping
	short segments. {A}ll the fragments are then projected into the multidimensional
	space of sequence fragments via a collection of different representations.
	{T}hose points are classified with pre-built statistical models (the
	{SVM} method with linear, polynomial and radial kernel functions)
	either as phosphorylated or inactive ones. {T}he resulting list of
	plausible sites for phosphorylation by various types of kinases in
	the query protein is returned to the user. {T}he efficiency of the
	method for each type of phosphorylation is estimated using leave-one-out
	tests and presented here. {T}he sensitivities of the models can reach
	over 70\%, depending on the type of kinase. {T}he additional information
	from profile representations of short sequence fragments helps in
	gaining a higher degree of accuracy in some phosphorylation types.
	{T}he further development of an automatic phosphorylation site annotation
	predictor based on our algorithm should yield a significant improvement
	when using statistical algorithms in order to quantify the results.},
  pdf = {../local/Plewczyski2005support.pdf},
  file = {Plewczyski2005support.pdf:local/Plewczyski2005support.pdf:PDF},
  keywords = {biosvm}
}
@article{Plewczynski2005AutoMotif,
  author = {Dariusz Plewczynski and Adrian Tkacz and Lucjan Stanislaw Wyrwicz
	and Leszek Rychlewski},
  title = {Auto{M}otif server: prediction of single residue post-translational
	modifications in proteins.},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21},
  pages = {2525--2527},
  number = {10},
  month = may,
  abstract = {The {A}uto{M}otif {S}erver allows for identification of post-translational
	modification ({PTM}) sites in proteins based only on local sequence
	information. {T}he local sequence preferences of short segments around
	{PTM} residues are described here as linear functional motifs ({LFM}s).
	{S}equence models for all types of {PTM}s are trained by support
	vector machine on short-sequence fragments of proteins in the current
	release of {S}wiss-{P}rot database (phosphorylation by various protein
	kinases, sulfation, acetylation, methylation, amidation, etc.). {T}he
	accuracy of the identification is estimated using the standard leave-one-out
	procedure. {T}he sensitivities for all types of short {LFM}s are
	in the range of 70\%. {AVAILABILITY}: {T}he {A}uto{M}otif {S}erver
	is available free for academic use at http://automotif.bioinfo.pl/},
  doi = {10.1093/bioinformatics/bti333},
  pdf = {../local/Plewczynski2005AutoMotif.pdf},
  file = {Plewczynski2005AutoMotif.pdf:local/Plewczynski2005AutoMotif.pdf:PDF},
  keywords = {biosvm},
  pii = {bti333},
  url = {http://dx.doi.org/10.1093/bioinformatics/bti333}
}
@article{Pochet2004Systematic,
  author = {Pochet, N. and De Smet, F. and Suykens, J. A. K. and De Moor, B.
	L. R.},
  title = {Systematic benchmarking of microarray data classification: assessing
	the role of non-linearity and dimensionality reduction},
  journal = {Bioinformatics},
  year = {2004},
  volume = {20},
  pages = {3185--3195},
  number = {17},
  month = nov,
  abstract = {Motivation: {M}icroarrays are capable of determining the expression
	levels of thousands of genes simultaneously. {I}n combination with
	classification methods, this technology can be useful to support
	clinical management decisions for individual patients, e.g. in oncology.
	{T}he aim of this paper is to systematically benchmark the role of
	non-linear versus linear techniques and dimensionality reduction
	methods. {R}esults: {A} systematic benchmarking study is performed
	by comparing linear versions of standard classification and dimensionality
	reduction techniques with their non-linear versions based on non-linear
	kernel functions with a radial basis function ({RBF}) kernel. {A}
	total of 9 binary cancer classification problems, derived from 7
	publicly available microarray datasets, and 20 randomizations of
	each problem are examined. {C}onclusions: {T}hree main conclusions
	can be formulated based on the performances on independent test sets.
	(1) {W}hen performing classification with least squares support vector
	machines ({LS}-{SVM}s) (without dimensionality reduction), {RBF}
	kernels can be used without risking too much overfitting. {T}he results
	obtained with well-tuned {RBF} kernels are never worse and sometimes
	even statistically significantly better compared to results obtained
	with a linear kernel in terms of test set receiver operating characteristic
	and test set accuracy performances. (2) {E}ven for classification
	with linear classifiers like {LS}-{SVM} with linear kernel, using
	regularization is very important. (3) {W}hen performing kernel principal
	component analysis (kernel {PCA}) before classification, using an
	{RBF} kernel for kernel {PCA} tends to result in overfitting, especially
	when using supervised feature selection. {I}t has been observed that
	an optimal selection of a large number of features is often an indication
	for overfitting. {K}ernel {PCA} with linear kernel gives better results.
	{A}vailability: {M}atlab scripts are available on request. {S}upplementary
	information: http://www.esat.kuleuven.ac.be/~npochet/{B}ioinformatics/},
  doi = {10.1093/bioinformatics/bth383},
  pdf = {../local/Pochet2004Systematic.pdf},
  file = {Pochet2004Systematic.pdf:local/Pochet2004Systematic.pdf:PDF},
  keywords = {biosvm microarray},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1093/bioinformatics/bth383}
}
@article{Podani2001Comparable,
  author = {J. Podani and Z. N. Oltvai and H. Jeong and B. Tombor and A.-L. Barab{\'a}si
	and E. Szathm{\'a}ry},
  title = {Comparable system-level organization of {A}rchaea and {E}ukaryotes},
  journal = {Nat. {G}enet.},
  year = {2001},
  volume = {29},
  pages = {54--56},
  number = {1},
  pdf = {../local/poda01.pdf},
  file = {poda01.pdf:local/poda01.pdf:PDF},
  subject = {bionet},
  url = {http://www.nature.com/cgi-taf/DynaPage.taf?file=/ng/journal/v29/n1/full/ng708.html&filetype=PDF}
}
@article{Poggio1998Sparse,
  author = {Poggio, T. and Girosi, F.},
  title = {A {S}parse {R}epresentation for {F}unction {A}pproximation.},
  journal = {Neural {C}omput},
  year = {1998},
  volume = {10},
  pages = {1445--1454},
  number = {6},
  month = jul,
  abstract = {We derive a new general representation for a function as a linear
	combination of local correlation kernels at optimal sparse locations
	(and scales) and characterize its relation to principal component
	analysis, regularization, sparsity principles, and support vector
	machines.},
  keywords = {Algorithms, Automated, Biometry, Computers, DNA, Databases, Factual,
	Fungal, Fungal Proteins, GTP-Binding Proteins, Gene Expression, Genes,
	Learning, Markov Chains, Models, Neural Networks (Computer), Neurological,
	Non-P.H.S., Non-U.S. Gov't, Nucleic Acid Hybridization, Open Reading
	Frames, P.H.S., Pattern Recognition, Protein, Protein Structure,
	Proteins, Reproducibility of Results, Research Support, Saccharomyces
	cerevisiae, Sequence Alignment, Sequence Analysis, Software, Statistical,
	Tertiary, U.S. Gov't},
  pmid = {9698352}
}
@article{Pontil1998Properties,
  author = {M. Pontil and A. Verri},
  title = {Properties of support vector machines.},
  journal = {Neural {C}omput},
  year = {1998},
  volume = {10},
  pages = {955--974},
  number = {4},
  month = may,
  abstract = {Support vector machines ({SVM}s) perform pattern recognition between
	two point classes by finding a decision surface determined by certain
	points of the training set, termed support vectors ({SV}). {T}his
	surface, which in some feature space of possibly infinite dimension
	can be regarded as a hyperplane, is obtained from the solution of
	a problem of quadratic programming that depends on a regularization
	parameter. {I}n this article, we study some mathematical properties
	of support vectors and show that the decision surface can be written
	as the sum of two orthogonal terms, the first depending on only the
	margin vectors (which are {SV}s lying on the margin), the second
	proportional to the regularization parameter. {F}or almost all values
	of the parameter, this enables us to predict how the decision surface
	varies for small parameter changes. {I}n the special but important
	case of feature space of finite dimension m, we also show that m
	+ 1 {SV}s are usually sufficient to determine the decision surface
	fully. {F}or relatively small m, this latter result leads to a consistent
	reduction of the {SV} number.},
  keywords = {Algorithms, Artificial Intelligence, Automated, Biometry, Computers,
	DNA, Databases, Factual, Fungal, Fungal Proteins, GTP-Binding Proteins,
	Gene Expression, Genes, Learning, Linear Models, Markov Chains, Mathematics,
	Models, Neural Networks (Computer), Neurological, Non-P.H.S., Non-U.S.
	Gov't, Nonlinear Dynamics, Nucleic Acid Hybridization, Open Reading
	Frames, P.H.S., Pattern Recognition, Protein, Protein Structure,
	Proteins, Reproducibility of Results, Research Support, Saccharomyces
	cerevisiae, Sequence Alignment, Sequence Analysis, Software, Statistical,
	Tertiary, U.S. Gov't},
  pmid = {9573414}
}
@article{Prados2004Mining,
  author = {Prados, J. and Kalousis, A. and Sanchez, J. C. and Allard, L. and
	Carrette, O. and Hilario, M.},
  title = {Mining mass spectra for diagnosis and biomarker discovery of cerebral
	accidents.},
  journal = {Proteomics},
  year = {2004},
  volume = {4},
  pages = {2320--2332},
  number = {8},
  abstract = {In this paper we try to identify potential biomarkers for early stroke
	diagnosis using surface-enhanced laser desorption/ionization mass
	spectrometry coupled with analysis tools from machine learning and
	data mining. {D}ata consist of 42 specimen samples, i.e., mass spectra
	divided in two big categories, stroke and control specimens. {A}mong
	the stroke specimens two further categories exist that correspond
	to ischemic and hemorrhagic stroke; in this paper we limit our data
	analysis to discriminating between control and stroke specimens.
	{W}e performed two suites of experiments. {I}n the first one we simply
	applied a number of different machine learning algorithms; in the
	second one we have chosen the best performing algorithm as it was
	determined from the first phase and coupled it with a number of different
	feature selection methods. {T}he reason for this was 2-fold, first
	to establish whether feature selection can indeed improve performance,
	which in our case it did not seem to confirm, but more importantly
	to acquire a small list of potentially interesting biomarkers. {O}f
	the different methods explored the most promising one was support
	vector machines which gave us high levels of sensitivity and specificity.
	{F}inally, by analyzing the models constructed by support vector
	machines we produced a small set of 13 features that could be used
	as potential biomarkers, and which exhibited good performance both
	in terms of sensitivity, specificity and model stability.},
  doi = {10.1002/pmic.200400857},
  pdf = {../local/Prados2004Mining.pdf},
  file = {Prados2004Mining.pdf:local/Prados2004Mining.pdf:PDF},
  keywords = {biosvm proteomics},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1002/pmic.200400857}
}
@article{Prill2005PlosBiol,
  author = {Robert J Prill and Pablo A Iglesias and Andre Levchenko},
  title = {Dynamic properties of network motifs contribute to biological network
	organization.},
  journal = {PLoS Biol},
  year = {2005},
  volume = {3},
  pages = {e343},
  number = {11},
  month = nov,
  abstract = {Biological networks, such as those describing gene regulation, signal
	transduction, and neural synapses, are representations of large-scale
	dynamic systems. Discovery of organizing principles of biological
	networks can be enhanced by embracing the notion that there is a
	deep interplay between network structure and system dynamics. Recently,
	many structural characteristics of these non-random networks have
	been identified, but dynamical implications of the features have
	not been explored comprehensively. We demonstrate by exhaustive computational
	analysis that a dynamical property---stability or robustness to small
	perturbations---is highly correlated with the relative abundance of
	small subnetworks (network motifs) in several previously determined
	biological networks. We propose that robust dynamical stability is
	an influential property that can determine the non-random structure
	of biological networks.},
  doi = {10.1371/journal.pbio.0030343},
  institution = {Department of Biomedical Engineering, Johns Hopkins University, Baltimore,
	Maryland, USA.},
  keywords = {Animals; Caenorhabditis elegans, physiology; Computational Biology,
	methods; Computer Simulation; Drosophila melanogaster, physiology;
	Escherichia coli, physiology; Models, Biological; Nerve Net; Saccharomyces
	cerevisiae, physiology; Signal Transduction; Statistics as Topic;
	Systems Theory; Transcription, Genetic},
  language = {eng},
  medline-pst = {ppublish},
  owner = {Andrei Zinovyev},
  pii = {05-PLBI-RA-0233R2},
  pmid = {16187794},
  timestamp = {2011.04.08},
  url = {http://dx.doi.org/10.1371/journal.pbio.0030343}
}
@article{Qian2003Prediction,
  author = {Qian, J. and Lin, J. and Luscombe, N. M. and Yu, H. and Gerstein,
	M.},
  title = {Prediction of regulatory networks: genome-wide identification of
	transcription factor targets from gene expression data},
  journal = {Bioinformatics},
  year = {2003},
  volume = {19},
  pages = {1917--1926},
  number = {15},
  abstract = {Motivation: {D}efining regulatory networks, linking transcription
	factors ({TF}s) to their targets, is a central problem in post-genomic
	biology. {O}ne might imagine one could readily determine these networks
	through inspection of gene expression data. {H}owever, the relationship
	between the expression timecourse of a transcription factor and its
	target is not obvious (e.g. simple correlation over the timecourse),
	and current analysis methods, such as hierarchical clustering, have
	not been very successful in deciphering them. {R}esults: {H}ere we
	introduce an approach based on support vector machines ({SVM}s) to
	predict the targets of a transcription factor by identifying subtle
	relationships between their expression profiles. {I}n particular,
	we used {SVM}s to predict the regulatory targets for 36 transcription
	factors in the {S}accharomyces cerevisiae genome based on the microarray
	expression data from many different physiological conditions. {W}e
	trained and tested our {SVM} on a data set constructed to include
	a significant number of both positive and negative examples, directly
	addressing data imbalance issues. {T}his was non-trivial given that
	most of the known experimental information is only for positives.
	{O}verall, we found that 63\% of our {TF}-target relationships were
	confirmed through cross-validation. {W}e further assessed the performance
	of our regulatory network identifications by comparing them with
	the results from two recent genome-wide {C}h{IP}-chip experiments.
	{O}verall, we find the agreement between our results and these experiments
	is comparable to the agreement (albeit low) between the two experiments.
	{W}e find that this network has a delocalized structure with respect
	to chromosomal positioning, with a given transcription factor having
	targets spread fairly uniformly across the genome. {A}vailability:
	{T}he overall network of the relationships is available on the web
	at http://bioinfo.mbb.yale.edu/expression/echipchip},
  pdf = {../local/Qian2003Prediction.pdf},
  file = {Qian2003Prediction.pdf:local/Qian2003Prediction.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/15/1917}
}
@article{Qian2001Protein,
  author  = {Qian, J. and Luscombe, N. M. and Gerstein, M.},
  title   = {Protein {Fold} and {Family} {Occurrence} in {Genomes}: {Power-Law}
	{Behaviour} and {Evolutionary} {Model}},
  journal = {J. {Mol}. {Biol}.},
  year    = {2001},
  volume  = {313},
  pages   = {673--681},
  url     = {http://partslist.org/powerlaw},
  pdf     = {../local/qian01.pdf},
  file    = {qian01.pdf:local/qian01.pdf:PDF},
  subject = {bionet}
}
@article{Qin2003Kernel,
  author = {Qin, J. and Lewis, D. P. and Noble, W. S.},
  title = {Kernel hierarchical gene clustering from microarray expression data},
  journal = {Bioinformatics},
  year = {2003},
  volume = {19},
  pages = {2097--2104},
  number = {16},
  abstract = {Motivation: {U}nsupervised analysis of microarray gene expression
	data attempts to find biologically significant patterns within a
	given collection of expression measurements. {F}or example, hierarchical
	clustering can be applied to expression profiles of genes across
	multiple experiments, identifying groups of genes that share similar
	expression profiles. {P}revious work using the support vector machine
	supervised learning algorithm with microarray data suggests that
	higher-order features, such as pairwise and tertiary correlations
	across multiple experiments, may provide significant benefit in learning
	to recognize classes of co-expressed genes. {R}esults: {W}e describe
	a generalization of the hierarchical clustering algorithm that efficiently
	incorporates these higher-order features by using a kernel function
	to map the data into a high-dimensional feature space. {W}e then
	evaluate the utility of the kernel hierarchical clustering algorithm
	using both internal and external validation. {T}he experiments demonstrate
	that the kernel representation itself is insufficient to provide
	improved clustering performance. {W}e conclude that mapping gene
	expression data into a high-dimensional feature space is only a good
	idea when combined with a learning algorithm, such as the support
	vector machine that does not suffer from the curse of dimensionality.
	{A}vailability: {S}upplementary data at www.cs.columbia.edu/compbio/hiclust.
	{S}oftware source code available by request.},
  pdf = {../local/Qin2003Kernel.pdf},
  file = {Qin2003Kernel.pdf:local/Qin2003Kernel.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/16/2097}
}
@article{Raghava2005Correlation,
  author = {Raghava, Gajendra P. S. and Han, Joon H.},
  title = {Correlation and prediction of gene expression level from amino acid
	and dipeptide composition of its protein},
  journal = {{BMC} Bioinformatics},
  year = {2005},
  volume = {6},
  pages = {59},
  number = {1},
  month = mar,
  abstract = {B{ACKGROUND}: {A} large number of papers have been published on analysis
	of microarray data with particular emphasis on normalization of data,
	detection of differentially expressed genes, clustering of genes
	and regulatory network. {O}n other hand there are only few studies
	on relation between expression level and composition of nucleotide/protein
	sequence, using expression data. {T}here is a need to understand
	why particular genes/proteins express more in particular conditions.
	{I}n this study, we analyze 3468 genes of {S}accharomyces cerevisiae
	obtained from {H}olstege et al., (1998) to understand the relationship
	between expression level and amino acid composition. {RESULTS}: {W}e
	compute the correlation between expression of a gene and amino acid
	composition of its protein. {I}t was observed that some residues
	(like {A}la, {G}ly, {A}rg and {V}al) have significant positive correlation
	(r > 0.20) and some other residues ({L}ike {A}sp, {L}eu, {A}sn and
	{S}er) have negative correlation (r < -0.15) with the expression
	of genes. {A} significant negative correlation (r = -0.18) was also
	found between length and gene expression. {T}hese observations indicate
	the relationship between percent composition and gene expression
	level. {T}hus, attempts have been made to develop a {S}upport {V}ector
	{M}achine ({SVM}) based method for predicting the expression level
	of genes from its protein sequence. {I}n this method the {SVM} is
	trained with proteins whose gene expression data is known in a given
	condition. {T}hen trained {SVM} is used to predict the gene expression
	of other proteins of the same organism in the same condition. {A}
	correlation coefficient r = 0.70 was obtained between predicted and
	experimentally determined expression of genes, which improves from
	r = 0.70 to 0.72 when dipeptide composition was used instead of residue
	composition. {T}he method was evaluated using 5-fold cross validation
	test. {W}e also demonstrate that amino acid composition information
	along with gene expression data can be used for improving the function
	classification of proteins. {CONCLUSION}: {T}here is a correlation
	between gene expression and amino acid composition that can be used
	to predict the expression level of genes up to a certain extent.
	{A} web server based on the above strategy has been developed for
	calculating the correlation between amino acid composition and gene
	expression and prediction of expression level http://kiwi.postech.ac.kr/raghava/lgepred/.
	{T}his server will allow users to study the evolution from expression
	data.},
  doi = {10.1186/1471-2105-6-59},
  keywords = {biosvm},
  pii = {1471-2105-6-59},
  url = {http://dx.doi.org/10.1186/1471-2105-6-59}
}
@article{Rain2001protein-protein,
  author  = {Rain, J.-C. and Selig, L. and De Reuse, H. and Battaglia, V.
	and Reverdy, C. and Simon, S. and Lenzen, G. and Petel, F.
	and Wojcik, J. and Sch{\"a}chter, V. and Chemama, Y. and Labigne, A.
	and Legrain, P.},
  title   = {The protein-protein interaction map of {Helicobacter} pylori},
  journal = {Nature},
  year    = {2001},
  volume  = {409},
  pages   = {211--215},
  url     = {http://www.nature.com/cgi-taf/DynaPage.taf?file=/nature/journal/v409/n6817/full/409211a0_fs.html&content_filetype=pdf},
  pdf     = {../local/rain01.pdf},
  file    = {rain01.pdf:local/rain01.pdf:PDF},
  subject = {bionetprot}
}
@article{Ramaswamy2001Multiclass,
  author = {Ramaswamy, S. and Tamayo, P. and Rifkin, R. and Mukherjee, S. and
	Yeang, C. H. and Angelo, M. and Ladd, C. and Reich, M. and Latulippe,
	E. and Mesirov, J. P. and Poggio, T. and Gerald, W. and Loda, M. and
	Lander, E. S. and Golub, T. R.},
  title = {Multiclass cancer diagnosis using tumor gene expression signatures},
  journal = {Proc. {N}atl. {A}cad. {S}ci. {USA}},
  year = {2001},
  volume = {98},
  pages = {15149--15154},
  number = {26},
  month = dec,
  abstract = {The optimal treatment of patients with cancer depends on establishing
	accurate diagnoses by using a complex combination of clinical and
	histopathological data. {I}n some instances, this task is difficult
	or impossible because of atypical clinical presentation or histopathology.
	{T}o determine whether the diagnosis of multiple common adult malignancies
	could be achieved purely by molecular classification, we subjected
	218 tumor samples, spanning 14 common tumor types, and 90 normal
	tissue samples to oligonucleotide microarray gene expression analysis.
	{T}he expression levels of 16,063 genes and expressed sequence tags
	were used to evaluate the accuracy of a multiclass classifier based
	on a support vector machine algorithm. {O}verall classification accuracy
	was 78\%, far exceeding the accuracy of random classification (9\%).
	{P}oorly differentiated cancers resulted in low-confidence predictions
	and could not be accurately classified according to their tissue
	of origin, indicating that they are molecularly distinct entities
	with dramatically different gene expression patterns compared with
	their well differentiated counterparts. {T}aken together, these results
	demonstrate the feasibility of accurate, multiclass molecular cancer
	classification and suggest a strategy for future clinical implementation
	of molecular cancer diagnostics.},
  doi = {10.1073/pnas.211566398},
  pdf = {../local/Ramaswamy2001Multiclass.pdf},
  file = {Ramaswamy2001Multiclass.pdf:local/Ramaswamy2001Multiclass.pdf:PDF},
  keywords = {biosvm microarray},
  owner = {vert},
  url = {http://dx.doi.org/10.1073/pnas.211566398}
}
@article{Rangwala2005Profile-based,
  author = {Rangwala, H. and Karypis, G.},
  title = {Profile-based direct kernels for remote homology detection and fold
	recognition},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21},
  pages = {4239--4247},
  number = {23},
  month = dec,
  abstract = {MOTIVATION: Protein remote homology detection is a central problem
	in computational biology. Supervised learning algorithms based on
	support vector machines are currently one of the most effective methods
	for remote homology detection. The performance of these methods depends
	on how the protein sequences are modeled and on the method used to
	compute the kernel function between them. RESULTS: We introduce two
	classes of kernel functions that are constructed by combining sequence
	profiles with new and existing approaches for determining the similarity
	between pairs of protein sequences. These kernels are constructed
	directly from these explicit protein similarity measures and employ
	effective profile-to-profile scoring schemes for measuring the similarity
	between pairs of proteins. Experiments with remote homology detection
	and fold recognition problems show that these kernels are capable
	of producing results that are substantially better than those produced
	by all of the existing state-of-the-art SVM-based methods. In addition,
	the experiments show that these kernels, even when used in the absence
	of profiles, produce results that are better than those produced
	by existing non-profile-based schemes. AVAILABILITY: The programs
	for computing the various kernel functions are available on request
	from the authors.},
  doi = {10.1093/bioinformatics/bti687},
  keywords = {biosvm},
  owner = {vert},
  pii = {bti687},
  pmid = {16188929},
  timestamp = {2007.08.01},
  url = {http://dx.doi.org/10.1093/bioinformatics/bti687}
}
@article{Rarey1996fast,
  author = {M. Rarey and B. Kramer and T. Lengauer and G. Klebe},
  title = {{A} fast flexible docking method using an incremental construction
	algorithm},
  journal = {J. Mol. Biol.},
  year = {1996},
  volume = {261},
  pages = {470--489},
  number = {3},
  month = aug,
  abstract = {We present an automatic method for docking organic ligands into protein
	binding sites. The method can be used in the design process of specific
	protein ligands. It combines an appropriate model of the physico-chemical
	properties of the docked molecules with efficient methods for sampling
	the conformational space of the ligand. If the ligand is flexible,
	it can adopt a large variety of different conformations. Each such
	minimum in conformational space presents a potential candidate for
	the conformation of the ligand in the complexed state. Our docking
	method samples the conformation space of the ligand on the basis
	of a discrete model and uses a tree-search technique for placing
	the ligand incrementally into the active site. For placing the first
	fragment of the ligand into the protein, we use hashing techniques
	adapted from computer vision. The incremental construction algorithm
	is based on a greedy strategy combined with efficient methods for
	overlap detection and for the search of new interactions. We present
	results on 19 complexes of which the binding geometry has been crystallographically
	determined. All considered ligands are docked in at most three minutes
	on a current workstation. The experimentally observed binding mode
	of the ligand is reproduced with 0.5 to 1.2 A rms deviation. It is
	almost always found among the highest-ranking conformations computed.},
  doi = {10.1006/jmbi.1996.0477},
  keywords = {Aldehyde Reductase, Algorithms, Amiloride, Aminoimidazole Carboxamide,
	Animals, Arabinose, Automation, Binding Sites, Carbonic Anhydrases,
	Computational Biology, Computer Simulation, Concanavalin A, Crystallography,
	Databases, Drug Design, Drug Evaluation, Enzyme Inhibitors, Factual,
	Folic Acid, Folic Acid Antagonists, Fructose-Bisphosphatase, Humans,
	Internet, Ligands, Methotrexate, Models, Molecular, Non-U.S. Gov't,
	Pancreatic Elastase, Pentamidine, Pliability, Point Mutation, Preclinical,
	Protein Binding, Protein Conformation, Proteins, Reproducibility
	of Results, Research Support, Ribonucleosides, Software, Tetrahydrofolate
	Dehydrogenase, Thermolysin, Time Factors, Trypsin, X-Ray, 8780787},
  owner = {mahe},
  pii = {S0022-2836(96)90477-5},
  pmid = {8780787},
  timestamp = {2006.09.05},
  url = {http://dx.doi.org/10.1006/jmbi.1996.0477}
}
@article{Rensing2005Protein,
  author = {Rensing, Stefan A. and Fritzowsky, Dana and Lang, Daniel and Reski, Ralf},
  title = {Protein encoding genes in an ancient plant: analysis of codon usage,
	retained genes and splice sites in a moss, {Physcomitrella} patens},
  journal = {{BMC} Genomics},
  year = {2005},
  volume = {6},
  pages = {43},
  number = {1},
  month = mar,
  abstract = {B{ACKGROUND}: {T}he moss {P}hyscomitrella patens is an emerging plant
	model system due to its high rate of homologous recombination, haploidy,
	simple body plan, physiological properties as well as phylogenetic
	position. {A}vailable {EST} data was clustered and assembled, and
	provided the basis for a genome-wide analysis of protein encoding
	genes. {RESULTS}: {W}e have clustered and assembled {P}hyscomitrella
	patens {EST} and {CDS} data in order to represent the transcriptome
	of this non-seed plant. {C}lustering of the publicly available data
	and subsequent prediction resulted in a total of 19,081 non-redundant
	{ORF}. {O}f these putative transcripts, approximately 30\% have a
	homolog in both rice and {A}rabidopsis transcriptome. {M}ore than
	130 transcripts are not present in seed plants but can be found in
	other kingdoms. {T}hese potential ``retained genes'' might have been
	lost during seed plant evolution. {F}unctional annotation of these
	genes reveals unequal distribution among taxonomic groups and intriguing
	putative functions such as cytotoxicity and nucleic acid repair.
	{W}hereas introns in the moss are larger on average than in the seed
	plant {A}rabidopsis thaliana, position and amount of introns are
	approximately the same. {C}ontrary to {A}rabidopsis, where {CDS}
	contain on average 44\% {G}/{C}, in {P}hyscomitrella the average
	{G}/{C} content is 50\%. {I}nterestingly, moss orthologs of {A}rabidopsis
	genes show a significant drift of codon fraction usage, towards the
	seed plant. {W}hile averaged codon bias is the same in {P}hyscomitrella
	and {A}rabidopsis, the distribution pattern is different, with 15\%
	of moss genes being unbiased. {S}pecies-specific, sensitive and selective
	splice site prediction for {P}hyscomitrella has been developed using
	a dataset of 368 donor and acceptor sites, utilizing a support vector
	machine. {T}he prediction accuracy is better than those achieved
	with tools trained on {A}rabidopsis data. {CONCLUSION}: {A}nalysis
	of the moss transcriptome displays differences in gene structure,
	codon and splice site usage in comparison with the seed plant {A}rabidopsis.
	{P}utative retained genes exhibit possible functions that might explain
	the peculiar physiological properties of mosses. {B}oth the transcriptome
	representation (including a {BLAST} and retrieval service) and splice
	site prediction have been made available on http://www.cosmoss.org,
	setting the basis for assembly and annotation of the {P}hyscomitrella
	genome, of which draft shotgun sequences will become available in
	2005.},
  doi = {10.1186/1471-2164-6-43},
  pdf = {../local/Rensing2005Protein.pdf},
  file = {Rensing2005Protein.pdf:local/Rensing2005Protein.pdf:PDF},
  keywords = {biosvm},
  pii = {1471-2164-6-43},
  url = {http://dx.doi.org/10.1186/1471-2164-6-43}
}
@article{Res2005evolution,
  author = {I. Res and I. Mihalek and O. Lichtarge},
  title = {An evolution based classifier for prediction of protein interfaces
	without using protein structures},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21},
  pages = {2496--2501},
  number = {10},
  month = may,
  abstract = {M{OTIVATION}: {T}he number of available protein structures still lags
	far behind the number of known protein sequences. {T}his makes it
	important to predict which residues participate in protein-protein
	interactions using only sequence information. {F}ew studies have
	tackled this problem until now. {RESULTS}: {W}e applied support vector
	machines to sequences in order to generate a classification of all
	protein residues into those that are part of a protein interface
	and those that are not. {F}or the first time evolutionary information
	was used as one of the attributes and this inclusion of evolutionary
	importance rankings improves the classification. {L}eave-one-out
	cross-validation experiments show that prediction accuracy reaches
	64\%.},
  doi = {10.1093/bioinformatics/bti340},
  pdf = {../local/Res2005evolution.pdf},
  file = {Res2005evolution.pdf:local/Res2005evolution.pdf:PDF},
  keywords = {biosvm},
  pii = {bti340},
  url = {http://dx.doi.org/10.1093/bioinformatics/bti340}
}
@article{Rhodes2007Oncomine,
  author = {Rhodes, Daniel R. and Kalyana-Sundaram, Shanker and Mahavisno, Vasudeva
	and Varambally, Radhika and Yu, Jianjun and Briggs, Benjamin B. and
	Barrette, Terrence R. and Anstet, Matthew J. and Kincead-Beal, Colleen
	and Kulkarni, Prakash and Varambally, Sooryanaryana and Ghosh, Debashis
	and Chinnaiyan, Arul M.},
  title = {Oncomine 3.0: genes, pathways, and networks in a collection of 18,000
	cancer gene expression profiles},
  journal = {Neoplasia},
  year = {2007},
  volume = {9},
  pages = {166--180},
  number = {2},
  month = feb,
  abstract = {DNA microarrays have been widely applied to cancer transcriptome analysis;
	however, the majority of such data are not easily accessible or comparable.
	Furthermore, several important analytic approaches have been applied
	to microarray analysis; however, their application is often limited.
	To overcome these limitations, we have developed Oncomine, a bioinformatics
	initiative aimed at collecting, standardizing, analyzing, and delivering
	cancer transcriptome data to the biomedical research community. Our
	analysis has identified the genes, pathways, and networks deregulated
	across 18,000 cancer gene expression microarrays, spanning the majority
	of cancer types and subtypes. Here, we provide an update on the initiative,
	describe the database and analysis modules, and highlight several
	notable observations. Results from this comprehensive analysis are
	available at http://www.oncomine.org.},
  institution = {Department of Pathology, University of Michigan Medical School, Ann
	Arbor, MI 48109-0940, USA.},
  keywords = {Antineoplastic Agents, pharmacology; Automatic Data Processing; Chromosome
	Mapping; Chromosomes, Human, genetics; Computational Biology, organization
	/&/ administration; Data Collection; Data Display; Data Interpretation,
	Statistical; Databases, Genetic; Drug Design; Gene Expression Profiling,
	statistics /&/ numerical data; Gene Expression Regulation, Neoplastic;
	Genes, Neoplasm; Humans; Internet; Models, Biological; Neoplasm Proteins,
	biosynthesis/chemistry/genetics; Neoplasms, classification/genetics/metabolism;
	Oligonucleotide Array Sequence Analysis; Subtraction Technique; Transcription,
	Genetic},
  language = {eng},
  medline-pst = {ppublish},
  owner = {jp},
  pmid = {17356713},
  timestamp = {2012.03.10}
}
@article{Rice2005Reconstructing,
  author = {Rice, J. J. and Tu, Y. and Stolovitzky, G.},
  title = {Reconstructing biological networks using conditional correlation
	analysis},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21},
  pages = {765--773},
  number = {6},
  month = mar,
  abstract = {MOTIVATION: One of the present challenges in biological research is
	the organization of the data originating from high-throughput technologies.
	One way in which this information can be organized is in the form
	of networks of influences, physical or statistical, between cellular
	components. We propose an experimental method for probing biological
	networks, analyzing the resulting data and reconstructing the network
	architecture. METHODS: We use networks of known topology consisting
	of nodes (genes), directed edges (gene-gene interactions) and a dynamics
	for the genes' mRNA concentrations in terms of the gene-gene interactions.
	We proposed a network reconstruction algorithm based on the conditional
	correlation of the mRNA equilibrium concentration between two genes
	given that one of them was knocked down. Using simulated gene expression
	data on networks of known connectivity, we investigated how the reconstruction
	error is affected by noise, network topology, size, sparseness and
	dynamic parameters. RESULTS: Errors arise from correlation between
	nodes connected through intermediate nodes (false positives) and
	when the correlation between two directly connected nodes is obscured
	by noise, non-linearity or multiple inputs to the target node (false
	negatives). Two critical components of the method are as follows:
	(1) the choice of an optimal correlation threshold for predicting
	connections and (2) the reduction of errors arising from indirect
	connections (for which a novel algorithm is proposed). With these
	improvements, we can reconstruct networks with the topology of the
	transcriptional regulatory network in Escherichia coli with a reasonably
	low error rate.},
  doi = {10.1093/bioinformatics/bti064},
  institution = {Computational Biology Center, IBM T.J. Watson Research Center, PO
	Box 218, Yorktown Heights, NY 10598, USA.},
  keywords = {Algorithms; Computer Simulation; Gene Expression Profiling; Gene Expression
	Regulation; Models, Biological; Models, Statistical; Oligonucleotide
	Array Sequence Analysis; Protein Interaction Mapping; Signal Transduction;
	Statistics as Topic; Transcription Factors},
  owner = {fantine},
  pii = {bti064},
  pmid = {15486043},
  timestamp = {2010.10.21},
  url = {http://dx.doi.org/10.1093/bioinformatics/bti064}
}
@article{Rice2000EMBOSS,
  author = {P. Rice and I. Longden and A. Bleasby},
  title = {{EMBOSS}: the {European Molecular Biology Open Software Suite}},
  journal = {Trends Genet.},
  year = {2000},
  volume = {16},
  pages = {276--277},
  number = {6},
  month = jun,
  institution = {The Sanger Centre, Wellcome Trust Genome Campus, Hinxton, Cambridge,
	UK CB10 1SA.},
  keywords = {Internet; Molecular Biology; Sequence Alignment, methods; Software;
	User-Computer Interface},
  language = {eng},
  medline-pst = {ppublish},
  owner = {bricehoffmann},
  pii = {S0168-9525(00)02024-2},
  pmid = {10827456},
  timestamp = {2009.07.29}
}
@article{Rice2005Mining,
  author = {Rice, Simon B. and Nenadic, Goran and Stapley, Benjamin J.},
  title = {Mining protein function from text using term-based support vector
	machines},
  journal = {{BMC} Bioinformatics},
  year = {2005},
  volume = {6},
  number = {Suppl 1},
  pages = {S22},
  abstract = {B{ACKGROUND}: {T}ext mining has spurred huge interest in the domain
	of biology. {T}he goal of the {B}io{C}re{A}t{I}v{E} exercise was
	to evaluate the performance of current text mining systems. {W}e
	participated in {T}ask 2, which addressed assigning {G}ene {O}ntology
	terms to human proteins and selecting relevant evidence from full-text
	documents. {W}e approached it as a modified form of the document
	classification task. {W}e used a supervised machine-learning approach
	(based on support vector machines) to assign protein function and
	select passages that support the assignments. {A}s classification
	features, we used a protein's co-occurring terms that were automatically
	extracted from documents. {RESULTS}: {T}he results evaluated by curators
	were modest, and quite variable for different problems: in many cases
	we have relatively good assignment of {GO} terms to proteins, but
	the selected supporting text was typically non-relevant (precision
	spanning from 3\% to 50\%). {T}he method appears to work best when
	a substantial set of relevant documents is obtained, while it works
	poorly on single documents and/or short passages. {T}he initial results
	suggest that our approach can also mine annotations from text even
	when an explicit statement relating a protein to a {GO} term is absent.
	{CONCLUSION}: {A} machine learning approach to mining protein function
	predictions from text can yield good performance only if sufficient
	training data is available, and significant amount of supporting
	data is used for prediction. {T}he most promising results are for
	combined document retrieval and {GO} term assignment, which calls
	for the integration of methods developed in {B}io{C}re{A}t{I}v{E}
	{T}ask 1 and {T}ask 2.},
  doi = {10.1186/1471-2105-6-S1-S22},
  pdf = {../local/Rice2005Mining.pdf},
  file = {Rice2005Mining.pdf:local/Rice2005Mining.pdf:PDF},
  keywords = {biosvm},
  pii = {1471-2105-6-S1-S22},
  url = {http://dx.doi.org/10.1186/1471-2105-6-S1-S22}
}
@article{Riedesel2004Peptide,
  author = {Riedesel, Henning and Kolbeck, Bj{\"o}rn and Schmetzer, Oliver and
	Knapp, Ernst-Walter},
  title = {Peptide binding at class {I} major histocompatibility complex scored
	with linear functions and support vector machines},
  journal = {Genome {I}nform {S}er {W}orkshop {G}enome {I}nform},
  year = {2004},
  volume = {15},
  pages = {198--212},
  number = {1},
  abstract = {We explore two different methods to predict the binding ability of
	nonapeptides at the class {I} major histocompatibility complex using
	a general linear scoring function that defines a separating hyperplane
	in the feature space of sequences. {I}n absence of suitable data
	on non-binding nonapeptides we generated sequences randomly from
	a selected set of proteins from the protein data bank. {T}he parameters
	of the scoring function were determined by a generalized least square
	optimization ({LSM}) and alternatively by the support vector machine
	({SVM}). {W}ith the generalized {LSM} impaired data for learning
	with a small set of binding peptides and a large set of non-binding
	peptides can be treated in a balanced way rendering {LSM} more successful
	than {SVM}, while for symmetric data sets {SVM} has a slight advantage
	compared to {LSM}.},
  pdf = {../local/Riedesel2004Peptide.pdf},
  file = {Riedesel2004Peptide.pdf:local/Riedesel2004Peptide.pdf:PDF},
  keywords = {biosvm},
  url = {http://www.jsbi.org/journal/IBSB04/IBSB04F004.html}
}
@article{Risau-Gusman2000Generalization,
  author = {Risau-Gusman, S. and Gordon, M. B.},
  title = {Generalization properties of finite-size polynomial support vector
	machines},
  journal = {Phys {R}ev {E} {S}tat {P}hys {P}lasmas {F}luids {R}elat {I}nterdiscip
	{T}opics},
  year = {2000},
  volume = {62},
  pages = {7092--7099},
  number = {5 Pt B},
  month = nov,
  abstract = {The learning properties of finite-size polynomial support vector machines
	are analyzed in the case of realizable classification tasks. {T}he
	normalization of the high-order features acts as a squeezing factor,
	introducing a strong anisotropy in the patterns distribution in feature
	space. {A}s a function of the training set size, the corresponding
	generalization error presents a crossover, more or less abrupt depending
	on the distribution's anisotropy and on the task to be learned, between
	a fast-decreasing and a slowly decreasing regime. {T}his behavior
	corresponds to the stepwise decrease found by {D}ietrich et al. [{P}hys.
	{R}ev. {L}ett. 82, 2975 (1999)] in the thermodynamic limit. {T}he
	theoretical results are in excellent agreement with the numerical
	simulations.},
  keywords = {Acute, Acute Disease, Adenocarcinoma, Algorithms, Amino Acid Sequence,
	Animals, Artificial Intelligence, Automated, B-Lymphocytes, Bacterial
	Proteins, Base Pair Mismatch, Base Sequence, Bayes Theorem, Binding
	Sites, Biological, Bone Marrow Cells, Brachyura, Cell Compartmentation,
	Chemistry, Child, Chromosome Aberrations, Classification, Codon,
	Colonic Neoplasms, Comparative Study, Computational Biology, Computer
	Simulation, Computer-Assisted, DNA, Data Interpretation, Databases,
	Decision Trees, Diabetes Mellitus, Diagnosis, Discriminant Analysis,
	Discrimination Learning, Electric Conductivity, Electrophysiology,
	Escherichia coli Proteins, Factual, Feedback, Female, Fungal, Gastric
	Emptying, Gene Expression Profiling, Gene Expression Regulation,
	Genes, Genetic, Genetic Markers, Genetic Predisposition to Disease,
	Genomics, Hemolysins, Humans, Indians, Initiator, Ion Channels, Kinetics,
	Leukemia, Likelihood Functions, Lipid Bilayers, Logistic Models,
	Lymphocytic, Male, Markov Chains, Melanoma, Models, Molecular, Myeloid,
	Neoplasm, Neoplasms, Neoplastic, Neural Networks (Computer), Neurological,
	Nevus, Non-P.H.S., Non-U.S. Gov't, Nonlinear Dynamics, Normal Distribution,
	North American, Nucleic Acid Conformation, Oligonucleotide Array
	Sequence Analysis, Organ Specificity, Organelles, Ovarian Neoplasms,
	Ovary, P.H.S., Pattern Recognition, Physical, Pigmented, Predictive
	Value of Tests, Promoter Regions (Genetics), Protein Biosynthesis,
	Protein Folding, Protein Structure, Proteins, Proteome, RNA, Reproducibility
	of Results, Research Support, Saccharomyces cerevisiae, Secondary,
	Sensitivity and Specificity, Sequence Alignment, Sequence Analysis,
	Sex Characteristics, Skin Diseases, Skin Neoplasms, Skin Pigmentation,
	Software, Sound Spectrography, Statistical, Stomach Diseases, T-Lymphocytes,
	Thermodynamics, Transcription, Transcription Factors, Tumor Markers,
	Type 2, U.S. Gov't, Vertebrates, 0011102066}
}
@article{Rodriguez-Paredes2011Cancer,
  author = {Rodr{\'\i}guez-Paredes, Manuel and Esteller, Manel},
  title = {Cancer epigenetics reaches mainstream oncology},
  journal = {Nat Med},
  year = {2011},
  volume = {17},
  pages = {330--339},
  number = {3},
  month = mar,
  abstract = {Epigenetics is one of the most promising and expanding fields in the
	current biomedical research landscape. Since the inception of epigenetics
	in the 1940s, the discoveries regarding its implications in normal
	and disease biology have not stopped, compiling a vast amount of
	knowledge in the past decade. The field has moved from just one recognized
	marker, DNA methylation, to a variety of others, including a wide
	spectrum of histone modifications. From the methodological standpoint,
	the successful initial single gene candidate approaches have been
	complemented by the current comprehensive epigenomic approaches that
	allow the interrogation of genomes to search for translational applications
	in an unbiased manner. Most important, the discovery of mutations
	in the epigenetic machinery and the approval of the first epigenetic
	drugs for the treatment of subtypes of leukemias and lymphomas has
	been an eye-opener for many biomedical scientists and clinicians.
	Herein, we will summarize the progress in the field of cancer epigenetics
	research that has reached mainstream oncology in the development
	of new biomarkers of the disease and new pharmacological strategies.},
  doi = {10.1038/nm.2305},
  institution = {Cancer Epigenetics and Biology Program, Bellvitge Biomedical Research
	Institute, L'Hospitalet, and Department of Physiological Sciences
	II, School of Medicine, University of Barcelona, Barcelona, Spain.},
  keywords = {Amino Acid Sequence; DNA Methylation; Epigenesis, Genetic; Humans;
	Molecular Sequence Data; Neoplasms, genetics/therapy; Tumor Markers,
	Biological},
  language = {eng},
  medline-pst = {ppublish},
  owner = {philippe},
  pii = {nm.2305},
  pmid = {21386836},
  timestamp = {2011.06.04},
  url = {http://dx.doi.org/10.1038/nm.2305}
}
@article{Rose2005Correlation,
  author = {Rose, J. R. and Turkett, Jr., W. H. and Oroian, I. C. and Laegreid,
	W. W. and Keele, J.},
  title = {Correlation of amino acid preference and mammalian viral genome type},
  journal = {Bioinformatics},
  year = {2005},
  abstract = {Motivation: {I}n the event of an outbreak of a disease caused by an
	initially unknown pathogen, the ability to characterize anonymous
	sequences prior to isolation and culturing of the pathogen will be
	helpful. {W}e show that it is possible to classify viral sequences
	by genome type (ds{DNA}, ss{DNA}, ss{RNA} positive strand, ss{RNA}
	negative strand, retroid) using amino acid distribution.{R}esults:
	{I}n this paper we describe the results of analysis of amino acid
	preference in mammalian viruses. {T}he study was carried out at the
	genome level as well as two shorter sequence levels: short (300 amino
	acids) and medium length (660 amino acids). {T}he analysis indicates
	a correlation between the viral genome types ds{DNA}, ss{DNA}, ss{RNA}
	positive strand, ss{RNA} negative strand, and retroid and amino acid
	preference. {W}e investigated three different models of amino acid
	preference. {T}he simplest amino acid preference model, 1-{AAP},
	is a normalized description of the frequency of amino acids in genomes
	of a viral genome type. {A} slightly more complex model is the ordered
	pair amino acid preference model (2-{AAP}), which characterizes genomes
	of different viral genome types by the frequency of ordered pairs
	of amino acids. {T}he most complex and accurate model is the ordered
	triple amino acid preference model (3-{AAP}), which is based on ordered
	triples of amino acids. {T}he results demonstrate that mammalian
	viral genome types differ in their amino acid preference.{A}vailability:
	{T}he tools used to format and analyze data and supplementary material
	are available at http://www.cse.sc.edu/~rose/amino{P}reference/index.html.},
  doi = {10.1093/bioinformatics/bti174},
  pdf = {../local/Rose2005Correlation.pdf},
  file = {Rose2005Correlation.pdf:local/Rose2005Correlation.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/bti174v1}
}
@article{Roth1998Finding,
  author = {Roth, F. P. and Hughes, J. D. and Estep, P. W. and Church, G. M.},
  title = {Finding {DNA} regulatory motifs within unaligned noncoding sequences
	clustered by whole-genome {mRNA} quantitation.},
  journal = {Nat. Biotechnol.},
  year = {1998},
  volume = {16},
  pages = {939--945},
  number = {10},
  month = oct,
  abstract = {Whole-genome mRNA quantitation can be used to identify the genes that
	are most responsive to environmental or genotypic change. By searching
	for mutually similar DNA elements among the upstream non-coding DNA
	sequences of these genes, we can identify candidate regulatory motifs
	and corresponding candidate sets of coregulated genes. We have tested
	this strategy by applying it to three extensively studied regulatory
	systems in the yeast Saccharomyces cerevisiae: galactose response,
	heat shock, and mating type. Galactose-response data yielded the
	known binding site of Gal4, and six of nine genes known to be induced
	by galactose. Heat shock data yielded the cell-cycle activation motif,
	which is known to mediate cell-cycle dependent activation, and a
	set of genes coding for all four nucleosomal proteins. Mating type
	alpha and a data yielded all of the four relevant DNA motifs and
	most of the known a- and alpha-specific genes.},
  doi = {10.1038/nbt1098-939},
  institution = {Harvard University Graduate Biophysics Program and Harvard Medical
	School Department of Genetics, Boston, MA 02115, USA.},
  issn = {1087-0156},
  keywords = {bioinformatics, genome-wide, tfs},
  url = {http://dx.doi.org/10.1038/nbt1098-939}
}
@article{Rudd2005Eclair,
  author = {Rudd, S. and Tetko, I. V.},
  title = {Eclair--a web service for unravelling species origin of sequences
	sampled from mixed host interfaces.},
  journal = {Nucleic {A}cids {R}es},
  year = {2005},
  volume = {33},
  pages = {W724--W727},
  number = {Web Server issue},
  month = jul,
  abstract = {The identification of the genes that participate at the biological
	interface of two species remains critical to our understanding of
	the mechanisms of disease resistance, disease susceptibility and
	symbiosis. {T}he sequencing of complementary {DNA} (c{DNA}) libraries
	prepared from the biological interface between two organisms provides
	an inexpensive way to identify the novel genes that may be expressed
	as a cause or consequence of compatible or incompatible interactions.
	{S}equence classification and annotation of species origin typically
	use an orthology-based approach and require access to large portions
	of either genome, or a close relative. {N}ovel species- or clade-specific
	sequences may have no counterpart within existing databases and remain
	ambiguous features. {H}ere we present a web-service, {E}clair, which
	utilizes support vector machines for the classification of the origin
	of expressed sequence tags stemming from mixed host c{DNA} libraries.
	{I}n addition to providing an interface for the classification of
	sequences, users are presented with the opportunity to train a model
	to suit their preferred species pair. {E}clair is freely available
	at http://eclair.btk.fi.},
  doi = {10.1093/nar/gki434},
  pdf = {../local/Rudd2005Eclair.pdf},
  file = {Rudd2005Eclair.pdf:local/Rudd2005Eclair.pdf:PDF},
  keywords = {biosvm},
  pii = {33/suppl_2/W724},
  url = {http://dx.doi.org/10.1093/nar/gki434}
}
@article{Ruepp2005Assessment,
  author = {Ruepp, S. and Boess, F. and Suter, L. and de Vera, M. C. and Steiner,
	G. and Steele, T. and Weiser, T. and Albertini, S.},
  title = {Assessment of hepatotoxic liabilities by transcript profiling.},
  journal = {Toxicol {A}ppl {P}harmacol},
  year = {2005},
  month = jun,
  abstract = {Male {W}istar rats were treated with various model compounds or the
	appropriate vehicle controls in order to create a reference database
	for toxicogenomics assessment of novel compounds. {H}epatotoxic compounds
	in the database were either known hepatotoxicants or showed hepatotoxicity
	during preclinical testing. {H}istopathology and clinical chemistry
	data were used to anchor the transcript profiles to an established
	endpoint (steatosis, cholestasis, direct acting, peroxisomal proliferation
	or nontoxic/control). {T}hese reference data were analyzed using
	a supervised learning method (support vector machines, {SVM}) to
	generate classification rules. {T}his predictive model was subsequently
	used to assess compounds with regard to a potential hepatotoxic liability.
	{A} steatotic and a non-hepatotoxic 5{HT}(6) receptor antagonist
	compound from the same series were successfully discriminated by
	this toxicogenomics model. {A}dditionally, an example is shown where
	a hepatotoxic liability was correctly recognized in the absence of
	pathological findings. {I}n vitro experiments and a dog study confirmed
	the correctness of the toxicogenomics alert. {A}nother interesting
	observation was that transcript profiles indicate toxicologically
	relevant changes at an earlier timepoint than routinely used methods.
	{T}ogether, these results support the useful application of toxicogenomics
	in raising alerts for adverse effects and generating mechanistic
	hypotheses that can be followed up by confirmatory experiments.},
  doi = {10.1016/j.taap.2005.05.008},
  pdf = {../local/Ruepp2005Assessment.pdf},
  file = {Ruepp2005Assessment.pdf:local/Ruepp2005Assessment.pdf:PDF},
  keywords = {biosvm},
  pii = {S0041-008X(05)00295-4},
  url = {http://dx.doi.org/10.1016/j.taap.2005.05.008}
}
@incollection{Ratsch2004Accurate,
  author = {R{\"a}tsch, G. and Sonnenburg, S.},
  title = {Accurate splice site detection for {C}aenorhabditis elegans},
  booktitle = {Kernel {M}ethods in {C}omputational {B}iology},
  publisher = {MIT Press},
  year = {2004},
  editor = {Sch{\"o}lkopf, B. and Tsuda, K. and Vert, J.-P.},
  pages = {277--298},
  abstract = {During the past three years, the support vector machine learning algorithm
	has been extensively applied within the field of computational biology.
	{T}he algorithm has been used to detect patterns within and among
	biological sequences, to classify genes and patients based upon gene
	expression profiles, and has recently been applied to several new
	biological problems. {T}his chapter reviews the state of the art
	with respect to {SVM} applications in computational biology.},
  keywords = {biosvm},
  owner = {vert}
}
@article{Raetsch2005RASE,
  author = {G. R{\"a}tsch and S. Sonnenburg and B. Sch{\"o}lkopf},
  title = {{RASE}: recognition of alternatively spliced exons in {C}.elegans.},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21},
  pages = {i369--i377},
  number = {Suppl. 1},
  month = jun,
  abstract = {M{OTIVATION}: {E}ukaryotic pre-m{RNA}s are spliced to form mature
	m{RNA}. {P}re-m{RNA} alternative splicing greatly increases the complexity
	of gene expression. {E}stimates show that more than half of the human
	genes and at least one-third of the genes of less complex organisms,
	such as nematodes or flies, are alternatively spliced. {I}n this
	work, we consider one major form of alternative splicing, namely
	the exclusion of exons from the transcript. {I}t has been shown that
	alternatively spliced exons have certain properties that distinguish
	them from constitutively spliced exons. {A}lthough most recent computational
	studies on alternative splicing apply only to exons which are conserved
	among two species, our method only uses information that is available
	to the splicing machinery, i.e. the {DNA} sequence itself. {W}e employ
	advanced machine learning techniques in order to answer the following
	two questions: (1) {I}s a certain exon alternatively spliced? (2)
	{H}ow can we identify yet unidentified exons within known introns?
	{RESULTS}: {W}e designed a support vector machine ({SVM}) kernel
	well suited for the task of classifying sequences with motifs having
	positional preferences. {I}n order to solve the task (1), we combine
	the kernel with additional local sequence information, such as lengths
	of the exon and the flanking introns. {T}he resulting {SVM}-based
	classifier achieves a true positive rate of 48.5\% at a false positive
	rate of 1\%. {B}y scanning over single {EST} confirmed exons we identified
	215 potential alternatively spliced exons. {F}or 10 randomly selected
	such exons we successfully performed biological verification experiments
	and confirmed three novel alternatively spliced exons. {T}o answer
	question (2), we additionally used {SVM}-based predictions to recognize
	acceptor and donor splice sites. {C}ombined with the above mentioned
	features we were able to identify 85.2\% of skipped exons within
	known introns at a false positive rate of 1\%. {AVAILABILITY}: {D}atasets,
	model selection results, our predictions and additional experimental
	results are available at http://www.fml.tuebingen.mpg.de/~raetsch/{RASE}
	{CONTACT}: {G}unnar.{R}aetsch@tuebingen.mpg.de {SUPPLEMENTARY} {INFORMATION}:
	http://www.fml.tuebingen.mpg.de/raetsch/{RASE}.},
  doi = {10.1093/bioinformatics/bti1053},
  pdf = {../local/Raetsch2005RASE.pdf},
  file = {Raetsch2005RASE.pdf:local/Raetsch2005RASE.pdf:PDF},
  keywords = {biosvm},
  pii = {21/suppl_1/i369},
  url = {http://dx.doi.org/10.1093/bioinformatics/bti1053}
}
@article{Roegnvaldsson2004Why,
  author = {Thorsteinn R{\"o}gnvaldsson and Liwen You},
  title = {Why neural networks should not be used for {HIV}-1 protease cleavage
	site prediction.},
  journal = {Bioinformatics},
  year = {2004},
  volume = {20},
  pages = {1702--1709},
  number = {11},
  month = jul,
  abstract = {S{UMMARY}: {S}everal papers have been published where nonlinear machine
	learning algorithms, e.g. artificial neural networks, support vector
	machines and decision trees, have been used to model the specificity
	of the {HIV}-1 protease and extract specificity rules. {W}e show
	that the dataset used in these studies is linearly separable and
	that it is a misuse of nonlinear classifiers to apply them to this
	problem. {T}he best solution on this dataset is achieved using a
	linear classifier like the simple perceptron or the linear support
	vector machine, and it is straightforward to extract rules from these
	linear models. {W}e identify key residues in peptides that are efficiently
	cleaved by the {HIV}-1 protease and list the most prominent rules,
	relating them to experimental results for the {HIV}-1 protease. {MOTIVATION}:
	{U}nderstanding {HIV}-1 protease specificity is important when designing
	{HIV} inhibitors and several different machine learning algorithms
	have been applied to the problem. {H}owever, little progress has
	been made in understanding the specificity because nonlinear and
	overly complex models have been used. {RESULTS}: {W}e show that the
	problem is much easier than what has previously been reported and
	that linear classifiers like the simple perceptron or linear support
	vector machines are at least as good predictors as nonlinear algorithms.
	{W}e also show how sets of specificity rules can be generated from
	the resulting linear classifiers. {AVAILABILITY}: {T}he datasets
	used are available at http://www.hh.se/staff/bioinf/},
  doi = {10.1093/bioinformatics/bth144},
  pdf = {../local/Roegnvaldsson2004Why.pdf},
  file = {Roegnvaldsson2004Why.pdf:local/Roegnvaldsson2004Why.pdf:PDF},
  keywords = {biosvm},
  pii = {bth144},
  url = {http://dx.doi.org/10.1093/bioinformatics/bth144}
}
@article{Saeh2005Lead,
  author = {Saeh, J. and Lyne, P. and Takasaki, B. and Cosgrove, D.},
  title = {Lead hopping using {SVM} and {3D} pharmacophore fingerprints.},
  journal = {J {C}hem {I}nf {M}odel},
  year = {2005},
  volume = {45},
  pages = {1122--1133},
  number = {4},
  month = jul,
  abstract = {The combination of 3{D} pharmacophore fingerprints and the support
	vector machine classification algorithm has been used to generate
	robust models that are able to classify compounds as active or inactive
	in a number of {G}-protein-coupled receptor assays. {T}he models
	have been tested against progressively more challenging validation
	sets where steps are taken to ensure that compounds in the validation
	set are chemically and structurally distinct from the training set.
	{I}n the most challenging example, we simulate a lead-hopping experiment
	by excluding an entire class of compounds (defined by a core substructure)
	from the training set. {T}he left-out active compounds comprised
	approximately 40\% of the actives. {T}he model trained on the remaining
	compounds is able to recall 75\% of the actives from the "new" lead
	series while correctly classifying >99\% of the 5000 inactives included
	in the validation set.},
  doi = {10.1021/ci049732r},
  pdf = {../local/Saeh2005Lead.pdf},
  file = {Saeh2005Lead.pdf:local/Saeh2005Lead.pdf:PDF},
  keywords = {biosvm chemoinformatics},
  url = {http://dx.doi.org/10.1021/ci049732r}
}
@article{Saetrom2004Predicting,
  author = {Saetrom, P.},
  title = {Predicting the efficacy of short oligonucleotides in antisense and
	{RNAi} experiments with boosted genetic programming},
  journal = {Bioinformatics},
  year = {2004},
  volume = {20},
  pages = {3055--3063},
  number = {17},
  abstract = {Motivation: {B}oth small interfering {RNA}s (si{RNA}s) and antisense
	oligonucleotides can selectively block gene expression. {A}lthough
	the two methods rely on different cellular mechanisms, these methods
	share the common property that not all oligonucleotides (oligos)
	are equally effective. {T}hat is, if m{RNA} target sites are picked
	at random, many of the antisense or si{RNA} oligos will not be effective.
	{A}lgorithms that can reliably predict the efficacy of candidate
	oligos can greatly reduce the cost of knockdown experiments, but
	previous attempts to predict the efficacy of antisense oligos have
	had limited success. {M}achine learning has not previously been used
	to predict si{RNA} efficacy. {R}esults: {W}e develop a genetic programming
	based prediction system that shows promising results on both antisense
	and si{RNA} efficacy prediction. {W}e train and evaluate our system
	on a previously published database of antisense efficacies and our
	own database of si{RNA} efficacies collected from the literature.
	{T}he best models gave an overall correlation between predicted and
	observed efficacy of 0.46 on both antisense and si{RNA} data. {A}s
	a comparison, the best correlations of support vector machine classifiers
	trained on the same data were 0.40 and 0.30, respectively. {A}vailability:
	{T}he prediction system uses proprietary hardware and is available
	for both commercial and strategic academic collaborations. {T}he
	si{RNA} database is available upon request.},
  doi = {10.1093/bioinformatics/bth364},
  pdf = {../local/Saetrom2004Predicting.pdf},
  file = {Saetrom2004Predicting.pdf:local/Saetrom2004Predicting.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/20/17/3055}
}
@article{Saeys2004Feature,
  author = {Saeys, Y. and Degroeve, S. and Aeyels, D. and Rouz{\'e}, P. and Van
	de Peer, Y.},
  title = {Feature selection for splice site prediction: {A} new method using
	{EDA}-based feature ranking},
  journal = {B{MC} {B}ioinformatics},
  year = {2004},
  volume = {5},
  pages = {64},
  abstract = {Background {T}he identification of relevant biological features in
	large and complex datasets is an important step towards gaining insight
	in the processes underlying the data. {O}ther advantages of feature
	selection include the ability of the classification system to attain
	good or even better solutions using a restricted subset of features,
	and a faster classification. {T}hus, robust methods for fast feature
	selection are of key importance in extracting knowledge from complex
	biological data. {R}esults {I}n this paper we present a novel method
	for feature subset selection applied to splice site prediction, based
	on estimation of distribution algorithms, a more general framework
	of genetic algorithms. {F}rom the estimated distribution of the algorithm,
	a feature ranking is derived. {A}fterwards this ranking is used to
	iteratively discard features. {W}e apply this technique to the problem
	of splice site prediction, and show how it can be used to gain insight
	into the underlying biological process of splicing. {C}onclusion
	{W}e show that this technique proves to be more robust than the traditional
	use of estimation of distribution algorithms for feature selection:
	instead of returning a single best subset of features (as they normally
	do) this method provides a dynamical view of the feature selection
	process, like the traditional sequential wrapper methods. {H}owever,
	the method is faster than the traditional techniques, and scales
	better to datasets described by a large number of features.},
  doi = {10.1186/1471-2105-5-64},
  pdf = {../local/Saeys2004Feature.pdf},
  file = {Saeys2004Feature.pdf:local/Saeys2004Feature.pdf:PDF},
  keywords = {biosvm},
  owner = {vert}
}
@article{Saeys2003Fast,
  author = {Saeys, Y. and Degroeve, S. and Aeyels, D. and Van de Peer, Y. and
	Rouz{\'e}, P.},
  title = {Fast feature selection using a simple estimation of distribution
	algorithm: a case study on splice site prediction},
  journal = {Bioinformatics},
  year = {2003},
  volume = {19},
  pages = {ii179--ii188},
  number = {Suppl. 2},
  abstract = {Motivation: {F}eature subset selection is an important preprocessing
	step for classification. {I}n biology, where structures or processes
	are described by a large number of features, the elimination of irrelevant
	and redundant information in a reasonable amount of time has a number
	of advantages. {I}t enables the classification system to achieve
	good or even better solutions with a restricted subset of features,
	allows for a faster classification, and it helps the human expert
	focus on a relevant subset of features, hence providing useful biological
	knowledge. {R}esults: {W}e present a heuristic method based on {E}stimation
	of {D}istribution {A}lgorithms to select relevant subsets of features
	for splice site prediction in {A}rabidopsis thaliana. {W}e show that
	this method performs a fast detection of relevant feature subsets
	using the technique of constrained feature subsets. {C}ompared to
	the traditional greedy methods the gain in speed can be up to one
	order of magnitude, with results being comparable or even better
	than the greedy methods. {T}his makes it a very practical solution
	for classification tasks that can be solved using a relatively small
	amount of discriminative features (or feature dependencies), but
	where the initial set of potential discriminative features is rather
	large. {K}eywords: {M}achine {L}earning, {F}eature {S}ubset {S}election,
	{E}stimation of {D}istribution {A}lgorithms, {S}plice {S}ite {P}rediction.
	{C}ontact: yvsae@gengenp.rug.ac.be},
  pdf = {../local/Saeys2003Fast.pdf},
  file = {Saeys2003Fast.pdf:local/Saeys2003Fast.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/suppl_2/ii179}
}
@article{Saigo2004Protein,
  author = {Saigo, H. and Vert, J.-P. and Ueda, N. and Akutsu, T.},
  title = {Protein homology detection using string alignment kernels},
  journal = {Bioinformatics},
  year = {2004},
  volume = {20},
  pages = {1682--1689},
  number = {11},
  abstract = {Motivation: {R}emote homology detection between protein sequences
	is a central problem in computational biology. {D}iscriminative methods
	involving support vector machines ({SVM}s) are currently the most
	effective methods for the problem of superfamily recognition in the
	{S}tructural {C}lassification {O}f {P}roteins ({SCOP}) database.
	{T}he performance of {SVM}s depends critically on the kernel function
	used to quantify the similarity between sequences. {R}esults: {W}e
	propose new kernels for strings adapted to biological sequences,
	which we call local alignment kernels. {T}hese kernels measure the
	similarity between two sequences by summing up scores obtained from
	local alignments with gaps of the sequences. {W}hen tested in combination
	with {SVM} on their ability to recognize {SCOP} superfamilies on
	a benchmark dataset, the new kernels outperform state-of-the-art
	methods for remote homology detection. {A}vailability: {S}oftware
	and data available upon request.},
  pdf = {../local/Saigo2004Protein.pdf},
  file = {Saigo2004Protein.pdf:local/Saigo2004Protein.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/20/11/1682}
}
@article{Salim2003Combination,
  author = {N. Salim and J. Holliday and P. Willett},
  title = {Combination of fingerprint-based similarity coefficients using
	data fusion.},
  journal = {J Chem Inf Comput Sci},
  year = {2003},
  volume = {43},
  pages = {435--442},
  number = {2},
  abstract = {Many different types of similarity coefficients have been described
	in the literature. Since different coefficients take into account
	different characteristics when assessing the degree of similarity
	between molecules, it is reasonable to combine them to further optimize
	the measures of similarity between molecules. This paper describes
	experiments in which data fusion is used to combine several binary
	similarity coefficients to get an overall estimate of similarity
	for searching databases of bioactive molecules. The results show
	that search performances can be improved by combining coefficients
	with little extra computational cost. However, there is no single
	combination which gives a consistently high performance for all search
	types.},
  doi = {10.1021/ci025596j},
  keywords = {80 and over, Acid-Base Imbalance, Acute, Acute Disease, Adolescent,
	Adult, African Americans, Aged, Anemia, Animals, Anti-HIV Agents,
	Anti-Infective Agents, Antibiotics, Antibodies, Antineoplastic, Antineoplastic
	Agents, Antineoplastic Combined Chemotherapy Protocols, Antitubercular
	Agents, Aorta, Asparaginase, Autoimmune, B-Cell, Bangladesh, Bicarbonates,
	Biological Markers, Blood Glucose, California, Camptothecin, Cellulitis,
	Chorionic Gonadotropin, Chronic Disease, Ciprofloxacin, Clinical
	Protocols, Colorectal Neoplasms, Combination, Comparative Study,
	Daunorubicin, Decision Trees, Dexamethasone, Diabetes Mellitus, Dideoxynucleosides,
	Directly Observed Therapy, Disease Transmission, Drug Administration
	Schedule, Drug Resistance, Drug Therapy, English Abstract, Female,
	Fluorouracil, Follow-Up Studies, Glucose Tolerance Test, Glucosephosphate
	Dehydrogenase, Glyburide, HIV Infections, HIV-1, Health Planning,
	Health Resources, Helminth, Hemolysis, Hemolytic, Hormonal, Hospital
	Mortality, Human, Humans, Hypoglycemic Agents, Immunoglobulin M,
	In Vitro, Incidence, Indinavir, Insulin, Intensive Care Units, Interstitial,
	Lactates, Leucovorin, Leukemia, Male, Maternal Age, Middle Aged,
	Motor Activity, Multidrug-Resistant, Mutation, Nephritis, Non-U.S.
	Gov't, Organoplatinum Compounds, Pennsylvania, Phytotherapy, Plant
	Extracts, Plant Leaves, Population Dynamics, Potassium Channels,
	Prednisone, Pregnancy, Pregnancy Outcome, Prenatal, Prenatal Care,
	Progesterone, Prognosis, Prospective Studies, Pulmonary, Rabbits,
	Randomized Controlled Trials, Rats, Research Support, Retrospective
	Studies, Risk Assessment, Scalp Dermatoses, Schistosomiasis japonica,
	Severity of Illness Index, Spondylarthropathies, Streptozocin, Survival
	Rate, Trauma Centers, Trauma Severity Indices, Tubal, Tuberculosis,
	Type 2, Ultrasonography, Vertical, Vincristine, Viral, Viral Load,
	Wistar, Wounds and Injuries, Ziziphus, beta Subunit},
  owner = {mahe},
  pmid = {12653506},
  timestamp = {2006.09.01},
  url = {http://dx.doi.org/10.1021/ci025596j}
}
@article{Salomon2006Predicting,
  author = {Salomon, J. and Flower, D. R.},
  title = {Predicting {Class} {II} {MHC-Peptide} binding: a kernel based
	approach using similarity scores.},
  journal = {BMC Bioinformatics},
  year = {2006},
  volume = {7},
  pages = {501},
  abstract = {BACKGROUND: Modelling the interaction between potentially antigenic
	peptides and Major Histocompatibility Complex (MHC) molecules is
	a key step in identifying potential T-cell epitopes. For Class II
	MHC alleles, the binding groove is open at both ends, causing ambiguity
	in the positional alignment between the groove and peptide, as well
	as creating uncertainty as to what parts of the peptide interact
	with the MHC. Moreover, the antigenic peptides have variable lengths,
	making naive modelling methods difficult to apply. This paper introduces
	a kernel method that can handle variable length peptides effectively
	by quantifying similarities between peptide sequences and integrating
	these into the kernel. RESULTS: The kernel approach presented here
	shows increased prediction accuracy with a significantly higher number
	of true positives and negatives on multiple MHC class II alleles,
	when testing data sets from MHCPEP 1, MCHBN 2, and MHCBench 3. Evaluation
	by cross validation, when segregating binders and non-binders, produced
	an average of 0.824 AROC for the MHCBench data sets (up from 0.756),
	and an average of 0.96 AROC for multiple alleles of the MHCPEP database.
	CONCLUSION: The method improves performance over existing state-of-the-art
	methods of MHC class II peptide binding predictions by using a custom,
	knowledge-based representation of peptides. Similarity scores, in
	contrast to a fixed-length, pocket-specific representation of amino
	acids, provide a flexible and powerful way of modelling MHC binding,
	and can easily be applied to other dynamic sequence problems.},
  doi = {10.1186/1471-2105-7-501},
  keywords = {Amino Acid, Binding Sites, Computational Biology, Databases, Epitope
	Mapping, Genetic, HLA-A Antigens, HLA-DR Antigens, Histocompatibility
	Antigens Class II, Humans, Peptides, Protein, Protein Binding, Protein
	Conformation, ROC Curve, Reproducibility of Results, Sequence Alignment,
	Sequence Analysis, Sequence Homology},
  pii = {1471-2105-7-501},
  pmid = {17105666},
  timestamp = {2007.01.25},
  url = {http://dx.doi.org/10.1186/1471-2105-7-501}
}
@article{Sanchez-Carbayo2003Gene,
  author = {Marta Sanchez-Carbayo and Nicholas D Socci and Juan Jose Lozano and
	Wentian Li and Elizabeth Charytonowicz and Thomas J Belbin and Michael
	B Prystowsky and Angel R Ortiz and Geoffrey Childs and Carlos Cordon-Cardo},
  title = {Gene discovery in bladder cancer progression using c{DNA} microarrays.},
  journal = {Am. {J}. {P}athol.},
  year = {2003},
  volume = {163},
  pages = {505--516},
  number = {2},
  month = aug,
  abstract = {To identify gene expression changes along progression of bladder cancer,
	we compared the expression profiles of early-stage and advanced bladder
	tumors using c{DNA} microarrays containing 17,842 known genes and
	expressed sequence tags. {T}he application of bootstrapping techniques
	to hierarchical clustering segregated early-stage and invasive transitional
	carcinomas into two main clusters. {M}ultidimensional analysis confirmed
	these clusters and more importantly, it separated carcinoma in situ
	from papillary superficial lesions and subgroups within early-stage
	and invasive tumors displaying different overall survival. {A}dditionally,
	it recognized early-stage tumors showing gene profiles similar to
	invasive disease. {D}ifferent techniques including standard t-test,
	single-gene logistic regression, and support vector machine algorithms
	were applied to identify relevant genes involved in bladder cancer
	progression. {C}ytokeratin 20, neuropilin-2, p21, and p33{ING}1 were
	selected among the top ranked molecular targets differentially expressed
	and validated by immunohistochemistry using tissue microarrays (n
	= 173). {T}heir expression patterns were significantly associated
	with pathological stage, tumor grade, and altered retinoblastoma
	({RB}) expression. {M}oreover, p33{ING}1 expression levels were significantly
	associated with overall survival. {A}nalysis of the annotation of
	the most significant genes revealed the relevance of critical genes
	and pathways during bladder cancer progression, including the overexpression
	of oncogenic genes such as {DEK} in superficial tumors or immune
	response genes such as {C}d86 antigen in invasive disease. {G}ene
	profiling successfully classified bladder tumors based on their progression
	and clinical outcome. {T}he present study has identified molecular
	biomarkers of potential clinical significance and critical molecular
	targets associated with bladder cancer progression.},
  pdf = {../local/Sanchez-Carbayo2003Gene.pdf},
  file = {Sanchez-Carbayo2003Gene.pdf:local/Sanchez-Carbayo2003Gene.pdf:PDF},
  keywords = {biosvm},
  url = {http://ajp.amjpathol.org/cgi/content/abstract/163/2/505}
}
@article{Sarda2005pSLIP,
  author   = {Deepak Sarda and Gek Huey Chua and Kuo-Bin Li and Arun Krishnan},
  title    = {p{SLIP}: {SVM} based protein subcellular localization prediction using
    multiple physicochemical properties.},
  journal  = {B{MC} {B}ioinformatics},
  year     = {2005},
  month    = {Jun},
  volume   = {6},
  number   = {1},
  pages    = {152},
  doi      = {10.1186/1471-2105-6-152},
  url      = {http://dx.doi.org/10.1186/1471-2105-6-152},
  pii      = {1471-2105-6-152},
  pdf      = {../local/Sarda2005pSLIP.pdf},
  file     = {Sarda2005pSLIP.pdf:local/Sarda2005pSLIP.pdf:PDF},
  keywords = {biosvm},
  abstract = {B{ACKGROUND}: {P}rotein subcellular localization is an important determinant of protein function and hence, reliable methods for prediction of localization are needed.
    {A} number of prediction algorithms have been developed based on amino acid compositions or on the {N}-terminal characteristics (signal peptides) of proteins.
    {H}owever, such approaches lead to a loss of contextual information.
    {M}oreover, where information about the physicochemical properties of amino acids has been used, the methods employed to exploit that information are less than optimal and could use the information more effectively.
    {RESULTS}: {I}n this paper, we propose a new algorithm called p{SLIP} which uses {S}upport {V}ector {M}achines ({SVM}s) in conjunction with multiple physicochemical properties of amino acids to predict protein subcellular localization in eukaryotes across six different locations, namely, chloroplast, cytoplasmic, extracellular, mitochondrial, nuclear and plasma membrane.
    {T}he algorithm was applied to the dataset provided by {P}ark and {K}anehisa and we obtained prediction accuracies for the different classes ranging from 87.7\%-97.0\% with an overall accuracy of 93.1\%.
    {CONCLUSIONS}: {T}his study presents a physicochemical property based protein localization prediction algorithm.
    {U}nlike other algorithms, contextual information is preserved by dividing the protein sequences into clusters.
    {T}he prediction accuracy shows an improvement over other algorithms based on various types of amino acid composition (single, pair and gapped pair).
    {W}e have also implemented a web server to predict protein localization across the six classes (available at http://pslip.bii.a-star.edu.sg).}
}
@article{Schneider1998Artificial,
  author = {G. Schneider and P. Wrede},
  title = {{A}rtificial neural networks for computer-based molecular design.},
  journal = {Prog Biophys Mol Biol},
  year = {1998},
  volume = {70},
  pages = {175--222},
  number = {3},
  abstract = {The theory of artificial neural networks is briefly reviewed focusing
	on supervised and unsupervised techniques which have great impact
	on current chemical applications. An introduction to molecular descriptors
	and representation schemes is given. In addition, worked examples
	of recent advances in this field are highlighted and pioneering publications
	are discussed. Applications of several types of artificial neural
	networks to compound classification, modelling of structure-activity
	relationships, biological target identification, and feature extraction
	from biopolymers are presented and compared to other techniques.
	Advantages and limitations of neural networks for computer-aided
	molecular design and sequence analysis are discussed.},
  keywords = {Algorithms, Amino Acid Sequence, Amino Acids, Animals, Artificial
	Intelligence, Automated, Bacterial, Bacterial Proteins, Bicuculline,
	Binding Sites, Biological, Biological Availability, Blood Proteins,
	Blood-Brain Barrier, Cation Transport Proteins, Cats, Cell Membrane
	Permeability, Chemical, Chemistry, Cluster Analysis, Combinatorial
	Chemistry Techniques, Comparative Study, Computational Biology, Computer
	Simulation, Computer Systems, Computer-Aided Design, Computer-Assisted,
	Computing Methodologies, DNA-Binding Proteins, Databases, Dogs, Drug
	Design, Electric Stimulation, Electromyography, Enzyme Inhibitors,
	Ether-A-Go-Go Potassium Channels, Excitatory Amino Acid Antagonists,
	Factual, False Positive Reactions, Forecasting, Forelimb, GABA Antagonists,
	Gene Expression Profiling, Genome, Glutamic Acid, Humans, Hydrogen
	Bonding, Image Enhancement, Image Interpretation, Image Processing,
	Information Storage and Retrieval, Iontophoresis, Kynurenic Acid,
	Least-Squares Analysis, Linear Models, Liver, Markov Chains, Metabolic
	Clearance Rate, Metalloendopeptidases, Microelectrodes, Models, Molecular,
	Molecular Conformation, Molecular Sequence Data, Molecular Structure,
	Motor Cortex, Movement, Multivariate Analysis, Nerve Net, Neural
	Networks (Computer), Neuropeptides, Non-U.S. Gov't, Nonlinear Dynamics,
	Pattern Recognition, Pharmaceutical, Pharmaceutical Preparations,
	Pharmacokinetics, Phylogeny, Potassium Channels, Predictive Value
	of Tests, Protein Interaction Mapping, Protein Sorting Signals, Protein
	Structure, Proteins, Rats, Reproducibility of Results, Research Support,
	Sensitivity and Specificity, Sequence Alignment, Sequence Analysis,
	Shoulder, Signal Processing, Software, Statistical, Stereotaxic Techniques,
	Structure-Activity Relationship, Terminology, Tertiary, Trans-Activators,
	Voltage-Gated, Zinc},
  owner = {mahe},
  pii = {S0079610798000261},
  pmid = {9830312},
  timestamp = {2006.09.06}
}
@article{Schwender2004pilot,
  author = {Holger Schwender and Manuela Zucknick and Katja Ickstadt and Hermann
	M Bolt and {GENICA network}},
  title = {A pilot study on the application of statistical classification procedures
	to molecular epidemiological data.},
  journal = {Toxicol {L}ett},
  year = {2004},
  volume = {151},
  pages = {291--299},
  number = {1},
  month = {Jun},
  abstract = {The development of new statistical methods for use in molecular epidemiology
	comprises the building and application of appropriate classification
	rules. {T}he aim of this study was to assess various classification
	methods that can potentially handle genetic interactions. {A} data
	set comprising genotypes at 25 single nucleotide polymorphic ({SNP})
	loci from 518 breast cancer cases and 586 age-matched population-based
	controls from the {GENICA} study was used to built a classification
	rule with the discrimination methods {SVM} (support vector machine),
	{CART} (classification and regression tree), {B}agging, {R}andom
	{F}orest, {L}ogit{B}oost and k nearest neighbours (k{NN}). {A} blind
	pilot analysis of the genotypic data set was a first approach to
	obtain an impression of the statistical structure of the data. {F}urthermore,
	this analysis was performed to explore classification methods that
	may be applied to molecular-epidemiological evaluation. {T}he results
	showed that all blindly applied classification methods had a slightly
	smaller misclassification rate than a random classification. {T}he
	findings, nevertheless, suggest that {SNP} data might be useful for
	the classification of individuals into categories of high or low
	risk of diseases.},
  keywords = {biosvm}
}
@book{Schoelkopf2004Kernel,
  title = {Kernel {M}ethods in {C}omputational {B}iology},
  publisher = {MIT Press},
  year = {2004},
  author = {Sch{\"o}lkopf, B. and Tsuda, K. and Vert, J.-P.},
  address = {Cambridge, Massachusetts},
  keywords = {biosvm},
  owner = {vert}
}
@inproceedings{Schoelkopf2002Kernel,
  author = {Sch{\"o}lkopf, B. and Weston, J. and Eskin, E. and Leslie, C. and
	Noble, W. S.},
  title = {A {K}ernel {A}pproach for {L}earning from {A}lmost {O}rthogonal {P}atterns},
  booktitle = {Proceedings of {ECML} 2002},
  year = {2002},
  pdf = {../local/scho02.pdf},
  file = {scho02.pdf:local/scho02.pdf:PDF},
  subject = {biokernel},
  url = {http://www.cs.columbia.edu/~cleslie/papers/domdiag.pdf}
}
@article{Seeger2004Gaussian,
  author = {Matthias Seeger},
  title = {Gaussian processes for machine learning.},
  journal = {Int {J} {N}eural {S}yst},
  year = {2004},
  volume = {14},
  pages = {69--106},
  number = {2},
  month = {Apr},
  abstract = {Gaussian processes ({GP}s) are natural generalisations of multivariate
	{G}aussian random variables to infinite (countably or continuous)
	index sets. {GP}s have been applied in a large number of fields to
	a diverse range of ends, and very many deep theoretical analyses
	of various properties are available. {T}his paper gives an introduction
	to {G}aussian processes on a fairly elementary level with special
	emphasis on characteristics relevant in machine learning. {I}t draws
	explicit connections to branches such as spline smoothing models
	and support vector machines in which similar ideas have been investigated.
	{G}aussian process models are routinely used to solve hard machine
	learning problems. {T}hey are attractive because of their flexible
	non-parametric nature and computational simplicity. {T}reated within
	a {B}ayesian framework, very powerful statistical methods can be
	implemented which offer valid estimates of uncertainties in our predictions
	and generic model selection procedures cast as nonlinear optimization
	problems. {T}heir main drawback of heavy computational scaling has
	recently been alleviated by the introduction of generic sparse approximations.
	{T}he mathematical literature on {GP}s is large and often uses deep
	concepts which are not required to fully understand most machine
	learning applications. {I}n this tutorial paper, we aim to present
	characteristics of {GP}s relevant to machine learning and to show
	up precise connections to other "kernel machines" popular in the
	community. {O}ur focus is on a simple presentation, but references
	to more detailed sources are provided.},
  keywords = {Algorithms, Amino Acids, Antibodies, Artificial Intelligence, Astrocytoma,
	Automated, Bayes Theorem, Biological, Biopsy, Brain, Brain Mapping,
	Brain Neoplasms, Calibration, Comparative Study, Computational Biology,
	Computer-Assisted, Computing Methodologies, Cysteine, Cystine, Dysplastic
	Nevus Syndrome, Electrodes, Electroencephalography, Entropy, Eosine
	Yellowish-(YS), Evoked Potentials, Female, Gene Expression Profiling,
	Hematoxylin, Horseradish Peroxidase, Humans, Image Interpretation,
	Image Processing, Imagery (Psychotherapy), Imagination, Laterality,
	Linear Models, Male, Melanoma, Models, Monoclonal, Movement, Neoplasms,
	Neural Networks (Computer), Neuropeptides, Non-P.H.S., Non-U.S. Gov't,
	Nonparametric, Normal Distribution, P.H.S., Pattern Recognition,
	Perception, Principal Component Analysis, Protein, Protein Array
	Analysis, Protein Interaction Mapping, Proteins, Regression Analysis,
	Research Support, Sensitivity and Specificity, Sequence Alignment,
	Sequence Analysis, Skin Neoplasms, Software, Statistical,
	Statistics, Tumor Markers, U.S. Gov't, User-Computer Interface, World
	Health Organization},
  pii = {S0129065704001899},
  pmid = {15112367}
}
@inproceedings{Seeger2002Covariance,
  author = {Seeger, M.},
  title = {Covariance {K}ernels from {B}ayesian {G}enerative {M}odels},
  booktitle = {Adv. {N}eural {I}nform. {P}rocess. {S}yst.},
  year = {2002},
  volume = {14},
  pages = {905--912},
  pdf = {../local/nips2001.pdf},
  file = {nips2001.pdf:http\://www.cs.berkeley.edu/~mseeger/papers/nips2001.pdf:PDF},
  keywords = {biosvm},
  owner = {vert}
}
@article{Segal2004module,
  author    = {Segal, E. and Friedman, N. and Koller, D. and Regev, A.},
  title     = {A module map showing conditional activity of expression modules in cancer.},
  journal   = {Nat. {G}enet.},
  year      = {2004},
  month     = {Oct},
  volume    = {36},
  number    = {10},
  pages     = {1090--1098},
  doi       = {10.1038/ng1434},
  url       = {http://dx.doi.org/10.1038/ng1434},
  pii       = {ng1434},
  pmid      = {15448693},
  pdf       = {../local/Segal2004module.pdf},
  file      = {Segal2004module.pdf:local/Segal2004module.pdf:PDF},
  keywords  = {biogm},
  owner     = {vert},
  timestamp = {2006.01.18},
  abstract  = {D{NA} microarrays are widely used to study changes in gene expression in tumors, but such studies are typically system-specific and do not address the commonalities and variations between different types of tumor.
    {H}ere we present an integrated analysis of 1,975 published microarrays spanning 22 tumor types.
    {W}e describe expression profiles in different tumors in terms of the behavior of modules, sets of genes that act in concert to carry out a specific function.
    {U}sing a simple unified analysis, we extract modules and characterize gene-expression profiles in tumors as a combination of activated and deactivated modules.
    {A}ctivation of some modules is specific to particular types of tumor; for example, a growth-inhibitory module is specifically repressed in acute lymphoblastic leukemias and may underlie the deregulated proliferation in these cancers.
    {O}ther modules are shared across a diverse set of clinical conditions, suggestive of common tumor progression mechanisms.
    {F}or example, the bone osteoblastic module spans a variety of tumor types and includes both secreted growth factors and their receptors.
    {O}ur findings suggest that there is a single mechanism for both primary tumor proliferation and metastasis to bone.
    {O}ur analysis presents multiple research directions for diagnostic, prognostic and therapeutic studies.}
}
@article{Segal2003Module,
  author = {Segal, E. and Shapira, M. and Regev, A. and Pe'er, D. and Botstein,
	D. and Koller, D. and Friedman, N.},
  title = {Module networks: identifying regulatory modules and their condition-specific
	regulators from gene expression data.},
  journal = {Nat. {G}enet.},
  year = {2003},
  volume = {34},
  pages = {166--176},
  number = {2},
  month = {Jun},
  abstract = {Much of a cell's activity is organized as a network of interacting
	modules: sets of genes coregulated to respond to different conditions.
	{W}e present a probabilistic method for identifying regulatory modules
	from gene expression data. {O}ur procedure identifies modules of
	coregulated genes, their regulators and the conditions under which
	regulation occurs, generating testable hypotheses in the form 'regulator
	{X} regulates module {Y} under conditions {W}'. {W}e applied the
	method to a {S}accharomyces cerevisiae expression data set, showing
	its ability to identify functionally coherent modules and their correct
	regulators. {W}e present microarray experiments supporting three
	novel predictions, suggesting regulatory roles for previously uncharacterized
	proteins.},
  doi = {10.1038/ng1165},
  pdf = {../local/Segal2003Module.pdf},
  file = {Segal2003Module.pdf:local/Segal2003Module.pdf:PDF},
  keywords = {biogm},
  owner = {vert},
  pii = {ng1165},
  pmid = {12740579},
  timestamp = {2006.01.18},
  url = {http://dx.doi.org/10.1038/ng1165}
}
@article{Segal2003Regression,
  author = {Segal, M. R. and Dahlquist, K. D. and Conklin, B. R.},
  title = {Regression approaches for microarray data analysis.},
  journal = {J. {C}omput. {B}iol.},
  year = {2003},
  volume = {10},
  pages = {961--980},
  number = {6},
  abstract = {A variety of new procedures have been devised to handle the two-sample
	comparison (e.g., tumor versus normal tissue) of gene expression
	values as measured with microarrays. {S}uch new methods are required
	in part because of some defining characteristics of microarray-based
	studies: (i) the very large number of genes contributing expression
	measures which far exceeds the number of samples (observations) available
	and (ii) the fact that by virtue of pathway/network relationships,
	the gene expression measures tend to be highly correlated. {T}hese
	concerns are exacerbated in the regression setting, where the objective
	is to relate gene expression, simultaneously for multiple genes,
	to some external outcome or phenotype. {C}orrespondingly, several
	methods have been recently proposed for addressing these issues.
	{W}e briefly critique some of these methods prior to a detailed evaluation
	of gene harvesting. {T}his reveals that gene harvesting, without
	additional constraints, can yield artifactual solutions. {R}esults
	obtained employing such constraints motivate the use of regularized
	regression procedures such as the lasso, least angle regression,
	and support vector machines. {M}odel selection and solution multiplicity
	issues are also discussed. {T}he methods are evaluated using a microarray-based
	study of cardiomyopathy in transgenic mice.},
  doi = {10.1089/106652703322756177},
  pdf = {../local/Segal2003Regression.pdf},
  file = {Segal2003Regression.pdf:local/Segal2003Regression.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert}
}
@article{Segal2003Classificationa,
  author = {Segal, N. H. and Pavlidis, P. and Antonescu, C. R. and Maki, R. G.
	and Noble, W. S. and DeSantis, D. and Woodruff, J. M. and Lewis,
	J. J. and Brennan, M. F. and Houghton, A. N. and Cordon-Cardo, C.},
  title = {Classification and {S}ubtype {P}rediction of {A}dult {S}oft {T}issue
	{S}arcoma by {F}unctional {G}enomics},
  journal = {Am. {J}. {P}athol.},
  year = {2003},
  volume = {163},
  pages = {691--700},
  number = {2},
  month = {Aug},
  abstract = {Adult soft tissue sarcomas are a heterogeneous group of tumors, including
	well-described subtypes by histological and genotypic criteria, and
	pleomorphic tumors typically characterized by non-recurrent genetic
	aberrations and karyotypic heterogeneity. {T}he latter pose a diagnostic
	challenge, even to experienced pathologists. {W}e proposed that gene
	expression profiling in soft tissue sarcoma would identify a genomic-based
	classification scheme that is useful in diagnosis. {RNA} samples
	from 51 pathologically confirmed cases, representing nine different
	histological subtypes of adult soft tissue sarcoma, were examined
	using the {A}ffymetrix {U}95{A} {G}ene{C}hip. {S}tatistical tests
	were performed on experimental groups identified by cluster analysis,
	to find discriminating genes that could subsequently be applied in
	a support vector machine algorithm. {S}ynovial sarcomas, round-cell/myxoid
	liposarcomas, clear-cell sarcomas and gastrointestinal stromal tumors
	displayed remarkably distinct and homogenous gene expression profiles.
	{P}leomorphic tumors were heterogeneous. {N}otably, a subset of malignant
	fibrous histiocytomas, a controversial histological subtype, was identified
	as a distinct genomic group. {T}he support vector machine algorithm
	supported a genomic basis for diagnosis, with both high sensitivity
	and specificity. {I}n conclusion, we showed gene expression profiling
	to be useful in classification and diagnosis, providing insights
	into pathogenesis and pointing to potential new therapeutic targets
	of soft tissue sarcoma.},
  pdf = {../local/Segal2003Classificationa.pdf},
  file = {Segal2003Classificationa.pdf:local/Segal2003Classificationa.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://ajp.amjpathol.org/cgi/content/abstract/163/2/691}
}
@article{Segal2003Classification,
  author = {Segal, N. H. and Pavlidis, P. and Noble, W. S. and Antonescu, C.
	R. and Viale, A. and Wesley, U. V. and Busam, K. and Gallardo, H.
	and DeSantis, D. and Brennan, M. F. and Cordon-Cardo, C. and Wolchok,
	J. D. and Houghton, A. N.},
  title = {Classification of {C}lear-{C}ell {S}arcoma as a {S}ubtype of {M}elanoma
	by {G}enomic {P}rofiling},
  journal = {J. {C}lin. {O}ncol.},
  year = {2003},
  volume = {21},
  pages = {1775--1781},
  number = {9},
  month = {May},
  abstract = {Purpose: {T}o develop a genome-based classification scheme for clear-cell
	sarcoma ({CCS}), also known as melanoma of soft parts ({MSP}), which
	would have implications for diagnosis and treatment. {T}his tumor
	displays characteristic features of soft tissue sarcoma ({STS}),
	including deep soft tissue primary location and a characteristic
	translocation, t(12;22)(q13;q12), involving {EWS} and {ATF}1 genes.
	{CCS}/{MSP} also has typical melanoma features, including immunoreactivity
	for {S}100 and {HMB}45, pigmentation, {MITF}-{M} expression, and
	a propensity for regional lymph node metastases. {M}aterials and
	{M}ethods: {RNA} samples from 21 cell lines and 60 pathologically
	confirmed cases of {STS}, melanoma, and {CCS}/{MSP} were examined
	using the {U}95{A} {G}ene{C}hip ({A}ffymetrix, {S}anta {C}lara, {CA}).
	{H}ierarchical cluster analysis, principal component analysis, and
	support vector machine ({SVM}) analysis exploited genomic correlations
	within the data to classify {CCS}/{MSP}. {R}esults: {U}nsupervised
	analyses demonstrated a clear distinction between {STS} and melanoma
	and, furthermore, showed that {CCS}/{MSP} cluster with the melanomas
	as a distinct group. {A} supervised {SVM} learning approach further
	validated this finding and provided a user-independent approach to
	diagnosis. {G}enes of interest that discriminate {CCS}/{MSP} included
	those encoding melanocyte differentiation antigens, {MITF}, {SOX}10,
	{ERBB}3, and {FGFR}1. {C}onclusion: {G}ene expression profiles support
	the classification of {CCS}/{MSP} as a distinct genomic subtype of
	melanoma. {A}nalysis of these gene profiles using the {SVM} may be
	an important diagnostic tool. {G}enomic analysis identified potential
	targets for the development of therapeutic strategies in the treatment
	of this disease.},
  doi = {10.1200/JCO.2003.10.108},
  pdf = {../local/Segal2003Classification.pdf},
  file = {Segal2003Classification.pdf:local/Segal2003Classification.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1200/JCO.2003.10.108}
}
@article{Seike2005Proteomic,
  author   = {Seike, M. and Kondo, T. and Fujii, K. and Okano, T. and Yamada, T. and
    Matsuno, Y. and Gemma, A. and Kudoh, S. and Hirohashi, S.},
  title    = {Proteomic signatures for histological types of lung cancer.},
  journal  = {Proteomics},
  year     = {2005},
  month    = {Jul},
  doi      = {10.1002/pmic.200401166},
  url      = {http://dx.doi.org/10.1002/pmic.200401166},
  pdf      = {../local/Seike2005Proteomic.pdf},
  file     = {Seike2005Proteomic.pdf:local/Seike2005Proteomic.pdf:PDF},
  keywords = {biosvm proteomics},
  abstract = {We performed proteomic studies on lung cancer cells to elucidate the mechanisms that determine histological phenotype.
    {T}hirty lung cancer cell lines with three different histological backgrounds (squamous cell carcinoma, small cell lung carcinoma and adenocarcinoma) were subjected to two-dimensional difference gel electrophoresis (2-{D} {DIGE}) and grouped by multivariate analyses on the basis of their protein expression profiles.
    2-{D} {DIGE} achieves more accurate quantification of protein expression by using highly sensitive fluorescence dyes to label the cysteine residues of proteins prior to two-dimensional polyacrylamide gel electrophoresis.
    {W}e found that hierarchical clustering analysis and principal component analysis divided the cell lines according to their original histology.
    {S}pot ranking analysis using a support vector machine algorithm and unsupervised classification methods identified 32 protein spots essential for the classification.
    {T}he proteins corresponding to the spots were identified by mass spectrometry.
    {N}ext, lung cancer cells isolated from tumor tissue by laser microdissection were classified on the basis of the expression pattern of these 32 protein spots.
    {B}ased on the expression profile of the 32 spots, the isolated cancer cells were categorized into three histological groups: the squamous cell carcinoma group, the adenocarcinoma group, and a group of carcinomas with other histological types.
    {I}n conclusion, our results demonstrate the utility of quantitative proteomic analysis for molecular diagnosis and classification of lung cancer cells.}
}
@article{Sen2004Predicting,
  author = {Sen, T. Z. and Kloczkowski, A. and Jernigan, R. L. and Yan, C. and
	Honavar, V. and Ho, K. M. and Wang, C. Z. and Ihm, Y. and Cao, H. and
	Gu, X. and Dobbs, D.},
  title = {Predicting binding sites of hydrolase-inhibitor complexes by combining
	several methods.},
  journal = {B{MC} {B}ioinformatics},
  year = {2004},
  volume = {5},
  pages = {205},
  abstract = {Background {P}rotein-protein interactions play a critical role in
	protein function. {C}ompletion of many genomes is being followed
	rapidly by major efforts to identify interacting protein pairs experimentally
	in order to decipher the networks of interacting, coordinated-in-action
	proteins. {I}dentification of protein-protein interaction sites and
	detection of specific amino acids that contribute to the specificity
	and the strength of protein interactions is an important problem
	with broad applications ranging from rational drug design to the
	analysis of metabolic and signal transduction networks. {R}esults
	{I}n order to increase the power of predictive methods for protein-protein
	interaction sites, we have developed a consensus methodology for
	combining four different methods. {T}hese approaches include: data
	mining using {S}upport {V}ector {M}achines, threading through protein
	structures, prediction of conserved residues on the protein surface
	by analysis of phylogenetic trees, and the {C}onservatism of {C}onservatism
	method of {M}irny and {S}hakhnovich. {R}esults obtained on a dataset
	of hydrolase-inhibitor complexes demonstrate that the combination
	of all four methods yield improved predictions over the individual
	methods. {C}onclusions {W}e developed a consensus method for predicting
	protein-protein interface residues by combining sequence and structure-based
	methods. {T}he success of our consensus approach suggests that similar
	methodologies can be developed to improve prediction accuracies for
	other bioinformatic problems.},
  doi = {10.1186/1471-2105-5-205},
  pdf = {../local/Sen2004Predicting.pdf},
  file = {Sen2004Predicting.pdf:local/Sen2004Predicting.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert}
}
@article{Senawongse2005Predicting,
  author = {Pasak Senawongse and Andrew R Dalby and Zheng Rong Yang},
  title = {Predicting the phosphorylation sites using hidden markov models and
	machine learning methods.},
  journal = {J {C}hem {I}nf {M}odel},
  year = {2005},
  volume = {45},
  pages = {1147--1152},
  number = {4},
  abstract = {Accurately predicting phosphorylation sites in proteins is an important
	issue in postgenomics, for which how to efficiently extract the most
	predictive features from amino acid sequences for modeling is still
	challenging. {A}lthough both the distributed encoding method and
	the bio-basis function method work well, they still have some limits
	in use. {T}he distributed encoding method is unable to code the biological
	content in sequences efficiently, whereas the bio-basis function
	method is a nonparametric method, which is often computationally
	expensive. {A}s hidden {M}arkov models ({HMM}s) can be used to generate
	one model for one cluster of aligned protein sequences, the aim in
	this study is to use {HMM}s to extract features from amino acid sequences,
	where sequence clusters are determined using available biological
	knowledge. {I}n this novel method, {HMM}s are first constructed using
	functional sequences only. {B}oth functional and nonfunctional training
	sequences are then inputted into the trained {HMM}s to generate functional
	and nonfunctional feature vectors. {F}rom this, a machine learning
	algorithm is used to construct a classifier based on these feature
	vectors. {I}t is found in this work that (1) this method provides
	much better prediction accuracy than the use of {HMM}s only for prediction,
	and (2) the support vector machines ({SVM}s) algorithm outperforms
	decision trees and neural network algorithms when they are constructed
	on the features extracted using the trained {HMM}s.},
  doi = {10.1021/ci050047+},
  pdf = {../local/Senawongse2005Predicting.pdf},
  file = {Senawongse2005Predicting.pdf:local/Senawongse2005Predicting.pdf:PDF},
  keywords = {biosvm},
  url = {http://dx.doi.org/10.1021/ci050047+}
}
@article{Serra2003Development,
  author = {Serra, J. R. and Thompson, E. D. and Jurs, P. C.},
  title = {Development of binary classification of structural chromosome aberrations
	for a diverse set of organic compounds from molecular structure},
  journal = {Chem. {R}es. {T}oxicol.},
  year = {2003},
  volume = {16},
  pages = {153--163},
  number = {2},
  abstract = {Classification models are generated to predict in vitro cytogenetic
	results for a diverse set of 383 organic compounds. {B}oth k-nearest
	neighbor and support vector machine models are developed. {T}hey
	are based on calculated molecular structure descriptors. {E}ndpoints
	used are the labels clastogenic or nonclastogenic according to an
	in vitro chromosomal aberration assay with {C}hinese hamster lung
	cells. {C}ompounds that were tested with both a 24 and 48 h exposure
	are included. {E}ach compound is represented by calculated molecular
	structure descriptors encoding the topological, electronic, geometrical,
	or polar surface area aspects of the structure. {S}ubsets of informative
	descriptors are identified with genetic algorithm feature selection
	coupled to the appropriate classification algorithm. {T}he overall
	classification success rate for a k-nearest neighbor classifier built
	with just six topological descriptors is 81.2\% for the training set
	and 86.5\% for an external prediction set. {T}he overall classification
	success rate for a three-descriptor support vector machine model
	is 99.7\% for the training set, 92.1\% for the cross-validation set,
	and 83.8\% for an external prediction set.},
  doi = {10.1021/tx020077w},
  pdf = {../local/Serra2003Development.pdf},
  file = {Serra2003Development.pdf:local/Serra2003Development.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1021/tx020077w}
}
@article{Shannon2003Analyzing,
  author = {William Shannon and Robert Culverhouse and Jill Duncan},
  title = {Analyzing microarray data using cluster analysis.},
  journal = {Pharmacogenomics},
  year = {2003},
  volume = {4},
  pages = {41--52},
  number = {1},
  month = {Jan},
  abstract = {As pharmacogenetics researchers gather more detailed and complex data
	on gene polymorphisms that effect drug metabolizing enzymes, drug
	target receptors and drug transporters, they will need access to
	advanced statistical tools to mine that data. {T}hese tools include
	approaches from classical biostatistics, such as logistic regression
	or linear discriminant analysis, and supervised learning methods
	from computer science, such as support vector machines and artificial
	neural networks. {I}n this review, we present an overview of another
	class of models, cluster analysis, which will likely be less familiar
	to pharmacogenetics researchers. {C}luster analysis is used to analyze
	data that is not a priori known to contain any specific subgroups.
	{T}he goal is to use the data itself to identify meaningful or informative
	subgroups. {S}pecifically, we will focus on demonstrating the use
	of distance-based methods of hierarchical clustering to analyze gene
	expression data.},
  keywords = {Algorithms, Automated, Base Pair Mismatch, Base Pairing, Base Sequence,
	Biosensing Techniques, Cluster Analysis, Comparative Study, Computer-Assisted,
	DNA, Gene Expression Profiling, Gene Expression Regulation, Genes,
	Hemolysins, Humans, Markov Chains, Messenger, Molecular Probe Techniques,
	Molecular Sequence Data, Nanotechnology, Neoplastic, Neural Networks
	(Computer), Non-U.S. Gov't, Nucleic Acid Conformation, Oligonucleotide
	Array Sequence Analysis, Pattern Recognition, Quality Control, RNA,
	Research Support, Signal Processing, Stomach Neoplasms},
  pmid = {12517285}
}
@article{Sharan2005motif-based,
  author = {R. Sharan and E. W Myers},
  title = {A motif-based framework for recognizing sequence families},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21 Suppl 1},
  pages = {i387--i393},
  month = jun,
  abstract = {M{OTIVATION}: {M}any signals in biological sequences are based on
	the presence or absence of base signals and their spatial combinations.
	{O}ne of the best known examples of this is the signal identifying
	a core promoter---the site at which the basal transcription machinery
	starts the transcription of a gene. {O}ur goal is a fully automatic
	pattern recognition system for a family of sequences, which simultaneously
	discovers the base signals, their spatial relationships and a classifier
	based upon them. {RESULTS}: {I}n this paper we present a general
	method for characterizing a set of sequences by their recurrent motifs.
	{O}ur approach relies on novel probabilistic models for {DNA} binding
	sites and modules of binding sites, on algorithms to study them from
	the data and on a support vector machine that uses the models studied
	to classify a set of sequences. {W}e demonstrate the applicability
	of our approach to diverse instances, ranging from families of promoter
	sequences to a dataset of intronic sequences flanking alternatively
	spliced exons. {O}n a core promoter dataset our results are comparable
	with the state-of-the-art {M}c{P}romoter. {O}n a dataset of alternatively
	spliced exons we outperform a previous approach. {W}e also achieve
	high success rates in recognizing cell cycle regulated genes. {T}hese
	results demonstrate that a fully automatic pattern recognition algorithm
	can meet or exceed the performance of hand-crafted approaches. {AVAILABILITY}:
	{T}he software and datasets are available from the authors upon request.
	{CONTACT}: roded@tau.ac.il.},
  doi = {10.1093/bioinformatics/bti1002},
  pdf = {../local/Sharan2005motif-based.pdf},
  file = {Sharan2005motif-based.pdf:local/Sharan2005motif-based.pdf:PDF},
  keywords = {biosvm},
  pii = {21/suppl_1/i387},
  url = {http://dx.doi.org/10.1093/bioinformatics/bti1002}
}
@inproceedings{She2003Frequent-subsequence-based,
  author = {She, R. and Chen, F. and Wang, K. and Ester, M. and Gardy, J. L. and
	Brinkman, F. S. L.},
  title = {Frequent-subsequence-based prediction of outer membrane proteins},
  booktitle = {{KDD} '03: Proceedings of the Ninth {ACM} {SIGKDD} International
	Conference on Knowledge Discovery and Data Mining},
  year = {2003},
  pages = {436--445},
  publisher = {ACM Press},
  abstract = {A number of medically important disease-causing bacteria (collectively
	called {G}ram-negative bacteria) are noted for the extra ``outer''
	membrane that surrounds their cell. {P}roteins resident in this membrane
	(outer membrane proteins, or {OMP}s) are of primary research interest
	for antibiotic and vaccine drug design as they are on the surface
	of the bacteria and so are the most accessible targets to develop
	new drugs against. {W}ith the development of genome sequencing technology
	and bioinformatics, biologists can now deduce all the proteins that
	are likely produced in a given bacteria and have attempted to classify
	where proteins are located in a bacterial cell. {H}owever such protein
	localization programs are currently least accurate when predicting
	{OMP}s, and so there is a current need for the development of a better
	{OMP} classifier. {D}ata mining research suggests that the use of
	frequent patterns has good performance in aiding the development
	of accurate and efficient classification algorithms. {I}n this paper,
	we present two methods to identify {OMP}s based on frequent subsequences
	and test them on all {G}ram-negative bacterial proteins whose localizations
	have been determined by biological experiments. {O}ne classifier
	follows an association rule approach, while the other is based on
	support vector machines ({SVM}s). {W}e compare the proposed methods
	with the state-of-the-art methods in the biological domain. {T}he
	results demonstrate that our methods are better both in terms of
	accurately identifying {OMP}s and providing biological insights that
	increase our understanding of the structures and functions of these
	important proteins.},
  doi = {10.1145/956750.956800},
  pdf = {../local/She2003Frequent-subsequence-based.pdf},
  file = {She2003Frequent-subsequence-based.pdf:local/She2003Frequent-subsequence-based.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert}
}
@article{Sherry2001dbSNP,
  author = {S. T. Sherry and M. H. Ward and M. Kholodov and J. Baker and L. Phan
	and E. M. Smigielski and K. Sirotkin},
  title = {{dbSNP}: the {NCBI} database of genetic variation},
  journal = {Nucleic Acids Res},
  year = {2001},
  volume = {29},
  pages = {308--311},
  number = {1},
  month = jan,
  abstract = {In response to a need for a general catalog of genome variation to
	address the large-scale sampling designs required by association
	studies, gene mapping and evolutionary biology, the National Center
	for Biotechnology Information (NCBI) has established the dbSNP database
	[S.T.Sherry, M.Ward and K. Sirotkin (1999) Genome Res., 9, 677-679].
	Submissions to dbSNP will be integrated with other sources of information
	at NCBI such as GenBank, PubMed, LocusLink and the Human Genome Project
	data. The complete contents of dbSNP are available to the public
	at website: http://www.ncbi.nlm.nih.gov/SNP. The complete contents
	of dbSNP can also be downloaded in multiple formats via anonymous
	FTP at ftp://ncbi.nlm.nih.gov/snp/.},
  institution = {National Center for Biotechnology Information, National Library of
	Medicine, National Institutes of Health, Bethesda, MD, 20894, USA.
	sherry@ncbi.nlm.nih.gov},
  keywords = {Animals; Biotechnology; Databases, Factual; Genetic Variation; Humans;
	Information Services; Internet; National Institutes of Health (U.S.);
	National Library of Medicine (U.S.); Polymorphism, Single Nucleotide,
	genetics; United States},
  language = {eng},
  medline-pst = {ppublish},
  owner = {philippe},
  pmid = {11125122},
  timestamp = {2010.08.01}
}
@article{Shi2005Building,
  author = {Lei Shi and Fabien Campagne},
  title = {Building a protein name dictionary from full text: a machine learning
	term extraction approach},
  journal = {{BMC} Bioinformatics},
  year = {2005},
  volume = {6},
  pages = {88},
  number = {1},
  month = apr,
  abstract = {B{ACKGROUND}: {T}he majority of information in the biological literature
	resides in full text articles, instead of abstracts. {Y}et, abstracts
	remain the focus of many publicly available literature data mining
	tools. {M}ost literature mining tools rely on pre-existing lexicons
	of biological names, often extracted from curated gene or protein
	databases. {T}his is a limitation, because such databases have low
	coverage of the many name variants which are used to refer to biological
	entities in the literature. {RESULTS}: {W}e present an approach to
	recognize named entities in full text. {T}he approach collects high
	frequency terms in an article, and uses support vector machines ({SVM})
	to identify biological entity names. {I}t is also computationally
	efficient and robust to noise commonly found in full text material.
	{W}e use the method to create a protein name dictionary from a set
	of 80,528 full text articles. {O}nly 8.3\% of the names in this dictionary
	match {S}wiss{P}rot description lines. {W}e assess the quality of
	the dictionary by studying its protein name recognition performance
	in full text. {CONCLUSION}: {T}his dictionary term lookup method
	compares favourably to other published methods, supporting the significance
	of our direct extraction approach. {T}he method is strong in recognizing
	name variants not found in {S}wiss{P}rot.},
  doi = {10.1186/1471-2105-6-88},
  pdf = {../local/Shi2005Building.pdf},
  file = {Shi2005Building.pdf:local/Shi2005Building.pdf:PDF},
  keywords = {biosvm},
  pii = {1471-2105-6-88},
  url = {http://dx.doi.org/10.1186/1471-2105-6-88}
}
@article{Shipp2002Diffuse,
  author = {Shipp, M. A. and Ross, K. N. and Tamayo, P. and Weng, A. P. and Kutok,
	J. L. and Aguiar, R. C. T. and Gaasenbeek, M. and Angelo, M. and
	Reich, M. and Pinkus, G. A. and Ray, T. S. and Koval, M. A. and Last,
	K. W. and Norton, A. and Lister, T. A. and Mesirov, J. and Neuberg,
	D. S. and Lander, E. S. and Aster, J. C. and Golub, T. R.},
  title = {Diffuse large {B}-cell lymphoma outcome prediction by gene-expression
	profiling and supervised machine learning},
  journal = {Nat. Med.},
  year = {2002},
  volume = {8},
  pages = {68--74},
  number = {1},
  abstract = {Diffuse large {B}-cell lymphoma ({DLBCL}), the most common lymphoid
	malignancy in adults, is curable in less than 50\% of patients. {P}rognostic
	models based on pre-treatment characteristics, such as the {I}nternational
	{P}rognostic {I}ndex ({IPI}), are currently used to predict outcome
	in {DLBCL}. {H}owever, clinical outcome models identify neither the
	molecular basis of clinical heterogeneity, nor specific therapeutic
	targets. {W}e analyzed the expression of 6,817 genes in diagnostic
	tumor specimens from {DLBCL} patients who received cyclophosphamide,
	adriamycin, vincristine and prednisone ({CHOP})-based chemotherapy,
	and applied a supervised learning prediction method to identify cured
	versus fatal or refractory disease. {T}he algorithm classified two
	categories of patients with very different five-year overall survival
	rates (70\% versus 12\%). {T}he model also effectively delineated patients
	within specific {IPI} risk categories who were likely to be cured
	or to die of their disease. {G}enes implicated in {DLBCL} outcome
	included some that regulate responses to {B}-cell--receptor signaling,
	critical serine/threonine phosphorylation pathways and apoptosis.
	{O}ur data indicate that supervised learning classification techniques
	can predict outcome in {DLBCL} and identify rational targets for
	intervention.},
  doi = {10.1038/nm0102-68},
  pdf = {../local/Shipp2002Diffuse.pdf},
  file = {Shipp2002Diffuse.pdf:local/Shipp2002Diffuse.pdf:PDF},
  keywords = {biosvm},
  owner = {vert}
}
@article{Shoeb2004Patient-specific,
  author = {Ali Shoeb and Herman Edwards and Jack Connolly and Blaise Bourgeois
	and S. Ted Treves and John Guttag},
  title = {Patient-specific seizure onset detection},
  journal = {Epilepsy Behav},
  year = {2004},
  volume = {5},
  pages = {483--498},
  number = {4},
  month = aug,
  abstract = {This article presents an automated, patient-specific method for the
	detection of epileptic seizure onset from noninvasive electroencephalography.
	{W}e adopt a patient-specific approach to exploit the consistency
	of an individual patient's seizure and nonseizure electroencephalograms.
	{O}ur method uses a wavelet decomposition to construct a feature
	vector that captures the morphology and spatial distribution of an
	electroencephalographic epoch, and then determines whether that vector
	is representative of a patient's seizure or nonseizure electroencephalogram
	using the support vector machine classification algorithm. {O}ur
	completely automated method was tested on noninvasive electroencephalograms
	from 36 pediatric subjects suffering from a variety of seizure types.
	{I}t detected 131 of 139 seizure events within 8.0+/-3.2 seconds
	of electrographic onset, and declared 15 false detections in 60 hours
	of clinical electroencephalography. {O}ur patient-specific method
	can be used to initiate delay-sensitive clinical procedures following
	seizure onset, for example, the injection of a functional imaging
	radiotracer.},
  doi = {10.1016/j.yebeh.2004.05.005},
  pdf = {../local/Shoeb2004Patient-specific.pdf},
  file = {Shoeb2004Patient-specific.pdf:local/Shoeb2004Patient-specific.pdf:PDF},
  keywords = {Algorithms, Comparative Study, Computational Biology, Computer-Assisted,
	Databases, Diagnosis, Drug Resistance, Electroencephalography, Epilepsy,
	Forecasting, Genetic, Genotype, HIV Protease Inhibitors, HIV-1, Humans,
	Information Management, Information Storage and Retrieval, Kinetics,
	Linear Models, Microbial Sensitivity Tests, Models, Monitoring, Non-U.S.
	Gov't, P.H.S., Periodicals, Physiologic, Point Mutation, Pyrimidinones,
	Reaction Time, Research Support, Reverse Transcriptase Inhibitors,
	Signal Processing, Theoretical, Time Factors, U.S. Gov't, Viral,
	15256184},
  pii = {S1525505004001593},
  url = {http://dx.doi.org/10.1016/j.yebeh.2004.05.005}
}
@article{Siepen2003Beta,
  author = {Siepen, J. A. and Radford, S. E. and Westhead, D. R.},
  title = {Beta {E}dge strands in protein structure prediction and aggregation},
  journal = {Protein Sci.},
  year = {2003},
  volume = {12},
  pages = {2348--2359},
  number = {10},
  abstract = {It is well established that recognition between exposed edges of {beta}-sheets
	is an important mode of protein-protein interaction and can have
	pathological consequences; for instance, it has been linked to the
	aggregation of proteins into a fibrillar structure, which is associated
	with a number of predominantly neurodegenerative disorders. {A} number
	of protective mechanisms have evolved in the edge strands of {beta}-sheets,
	preventing the aggregation and insolubility of most natural {beta}-sheet
	proteins. {S}uch mechanisms are unfavorable in the interior of a
	{beta}-sheet. {T}he problem of distinguishing edge strands from central
	strands based on sequence information alone is important in predicting
	residues and mutations likely to be involved in aggregation, and
	is also a first step in predicting folding topology. {H}ere we report
	support vector machine ({SVM}) and decision tree methods developed
	to classify edge strands from central strands in a representative
	set of protein domains. {I}nterestingly, rules generated by the decision
	tree method are in close agreement with our knowledge of protein
	structure and are potentially useful in a number of different biological
	applications. {W}hen trained on strands from proteins of known structure,
	using structure-based ({D}ictionary of {S}econdary {S}tructure in
	{P}roteins) strand assignments, both methods achieved mean cross-validated,
	prediction accuracies of $\sim$78\%. {T}hese accuracies were reduced when
	strand assignments from secondary structure prediction were used.
	{F}urther investigation of this effect revealed that it could be
	explained by a significant reduction in the accuracy of standard
	secondary structure prediction methods for edge strands, in comparison
	with central strands.},
  doi = {10.1110/ps.03234503},
  pdf = {../local/Siepen2003beta.pdf},
  file = {Siepen2003beta.pdf:local/Siepen2003beta.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://www.proteinscience.org/cgi/content/abstract/12/10/2348}
}
@article{Slanina2000Random,
  author = {Slanina, F. and Kotrla, M.},
  title = {Random networks created by biological evolution},
  journal = {Phys. Rev. E},
  year = {2000},
  volume = {62},
  pages = {6170--6177},
  number = {5},
  pdf = {../local/slan00.pdf},
  file = {slan00.pdf:local/slan00.pdf:PDF},
  subject = {bionet},
  url = {http://ojps.aip.org/getabs/servlet/GetabsServlet?prog=normal&id=PLEEE8000062000005006170000001&idtype=cvips&gifs=yes}
}
@article{Smith2004Towards,
  author = {P. A. Smith and M. J. Sorich and L. S C Low and R. A. McKinnon and
	J. O. Miners},
  title = {Towards integrated {ADME} prediction: past, present and future directions
	for modelling metabolism by {UDP}-glucuronosyltransferases},
  journal = {J Mol Graph Model},
  year = {2004},
  volume = {22},
  pages = {507--517},
  number = {6},
  month = jul,
  abstract = {Undesirable absorption, distribution, metabolism, excretion ({ADME})
	properties are the cause of many drug development failures and this
	has led to the need to identify such problems earlier in the development
	process. {T}his review highlights computational (in silico) approaches
	that have been used to identify the characteristics of ligands influencing
	molecular recognition and/or metabolism by the drug-metabolising
	enzyme {UDP}-glucuronosyltransferase ({UGT}). {C}urrent studies applying
	pharmacophore elucidation, 2{D}-quantitative structure metabolism
	relationships (2{D}-{QSMR}), 3{D}-quantitative structure metabolism
	relationships (3{D}-{QSMR}), and non-linear pattern recognition techniques
	such as artificial neural networks and support vector machines for
	modelling metabolism by {UGT} are reported. {A}n assessment of the
	utility of in silico approaches for the qualitative and quantitative
	prediction of drug glucuronidation parameters highlights the benefit
	of using multiple pharmacophores and also non-linear techniques for
	classification. {S}ome of the challenges facing the development of
	generalisable models for predicting metabolism by {UGT}, including
	the need for screening of more diverse structures, are also outlined.},
  doi = {10.1016/j.jmgm.2004.03.011},
  pdf = {../local/Smith2004Towards.pdf},
  file = {Smith2004Towards.pdf:local/Smith2004Towards.pdf:PDF},
  keywords = {Algorithms, Animals, Antisense, Artificial Intelligence, Astrocytoma,
	Automated, Autonomic Nervous System, Brain, Brain Neoplasms, Cell
	Line, Cerebral Cortex, Child, Cluster Analysis, Cognition, Comparative
	Study, Computational Biology, Computer Simulation, Computer-Assisted,
	DNA Fingerprinting, Databases, Diagnosis, Discriminant Analysis,
	Drug Design, Drug Evaluation, Electroencephalography, Emotions, Event-Related
	Potentials, Evoked Potentials, Factual, Fluorescence, Fuzzy Logic,
	Gene Silencing, Gene Targeting, Genetic, Glucuronosyltransferase,
	Hand, Hela Cells, Humans, Imaging, Intracellular Space, Magnetic
	Resonance Spectroscopy, Male, Meningeal Neoplasms, Meningioma, Microscopy,
	Models, Molecular Structure, Monitoring, Motor, Neoplasm Metastasis,
	Neoplasms, Neural Networks (Computer), Non-U.S. Gov't, Oligonucleotides,
	P.H.S., P300, Pattern Recognition, Peptides, Pharmaceutical Preparations,
	Physiologic, Preclinical, Predictive Value of Tests, Preschool, Prognosis,
	Protein Interaction Mapping, Protein Structure, Proteins, Proteomics,
	Quantitative Structure-Activity Relationship, Quaternary, RNA, RNA
	Interference, Recognition (Psychology), Reproducibility of Results,
	Research Support, Sensitivity and Specificity, Signal Processing,
	Small Interfering, Software, Thionucleotides, Three-Dimensional,
	Tumor, U.S. Gov't, User-Computer Interface, Word Processing, 15182810},
  pii = {S1093326304000269},
  url = {http://dx.doi.org/10.1016/j.jmgm.2004.03.011}
}
@techreport{Sole2001Model,
  author = {Sol{\'e}, R. V. and Pastor-Satorras, R. and Smith, E. D. and Kepler,
	T.},
  title = {A Model of Large-Scale Proteome Evolution},
  institution = {Santa Fe Institute},
  year = {2001},
  type = {Working Paper},
  number = {01-08-041},
  pdf = {../local/sole01.pdf},
  file = {sole01.pdf:local/sole01.pdf:PDF},
  subject = {bionetprot},
  url = {http://www.santafe.edu/sfi/publications/Abstracts/01-08-041abs.html}
}
@article{Song2002Prediction,
  author = {Minghu Song and Curt M Breneman and Jinbo Bi and N. Sukumar and Kristin
	P Bennett and Steven Cramer and Nihal Tugcu},
  title = {Prediction of protein retention times in anion-exchange chromatography
	systems using support vector regression},
  journal = {J Chem Inf Comput Sci},
  year = {2002},
  volume = {42},
  pages = {1347--1357},
  number = {6},
  abstract = {Quantitative {S}tructure-{R}etention {R}elationship ({QSRR}) models
	are developed for the prediction of protein retention times in anion-exchange
	chromatography systems. {T}opological, subdivided surface area, and
	{TAE} ({T}ransferable {A}tom {E}quivalent) electron-density-based
	descriptors are computed directly for a set of proteins using molecular
	connectivity patterns and crystal structure geometries. {A} novel
	algorithm based on {S}upport {V}ector {M}achine ({SVM}) regression
	has been employed to obtain predictive {QSRR} models using a two-step
	computational strategy. {I}n the first step, a sparse linear {SVM}
	was utilized as a feature selection procedure to remove irrelevant
	or redundant information. {S}ubsequently, the selected features were
	used to produce an ensemble of nonlinear {SVM} regression models
	that were combined using bootstrap aggregation (bagging) techniques,
	where various combinations of training and validation data sets were
	selected from the pool of available data. {A} visualization scheme
	(star plots) was used to display the relative importance of each
	selected descriptor in the final set of ``bagged'' models. {O}nce these
	predictive models have been validated, they can be used as an automated
	prediction tool for virtual high-throughput screening ({VHTS}).},
  keywords = {Acute, Algorithms, Animals, Anion Exchange Resins, Artificial Intelligence,
	Automated, Base Pair Mismatch, Base Pairing, Base Sequence, Biological,
	Biosensing Techniques, Carcinoma, Chemical, Chromatography, Classification,
	Cluster Analysis, Comparative Study, Computational Biology, Computer-Assisted,
	Cystadenoma, DNA, Decision Making, Diagnosis, Differential, Drug,
	Drug Design, Electrostatics, Eukaryotic Cells, Feasibility Studies,
	Female, Gene Expression, Gene Expression Profiling, Gene Expression
	Regulation, Genes, Genetic, Genetic Markers, Hemolysins, Humans,
	Internet, Ion Exchange, Leukemia, Ligands, Likelihood Functions,
	Logistic Models, Lung Neoplasms, Lymphocytic, Lymphoma, Markov Chains,
	Mathematics, Messenger, Models, Molecular, Molecular Probe Techniques,
	Molecular Sequence Data, Nanotechnology, Neoplasm, Neoplasms, Neoplastic,
	Neural Networks (Computer), Non-P.H.S., Non-Small-Cell Lung, Non-U.S.
	Gov't, Nucleic Acid Conformation, Nucleic Acid Hybridization, Observer
	Variation, Oligonucleotide Array Sequence Analysis, Ovarian Neoplasms,
	P.H.S., Pattern Recognition, Probability, Protein Binding, Protein
	Conformation, Proteins, Quality Control, Quantum Theory, RNA, RNA
	Splicing, Receptors, Reference Values, Regression Analysis, Reproducibility
	of Results, Research Support, Sensitivity and Specificity, Sequence
	Analysis, Signal Processing, Software, Statistical, Stomach Neoplasms,
	Thermodynamics, Transcription, Tumor Markers, U.S. Gov't, 12444731},
  pii = {ci025580t}
}
@inproceedings{Sonnenburg2002New,
  author = {Sonnenburg, S. and R{\"a}tsch, G. and Jagota, A. and M{\"u}ller,
	K.-R.},
  title = {New methods for splice-site recognition},
  booktitle = {Proc.\ International Conference on Artificial Neural Networks
	-- {ICANN}'02},
  year = {2002},
  editor = {Dorronsoro, J. R.},
  volume = {2415},
  series = {LNCS},
  pages = {329--336},
  publisher = {Springer},
  address = {Berlin},
  pdf = {../local/Sonnenburg2002New.pdf},
  file = {Sonnenburg2002New.pdf:local/Sonnenburg2002New.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert}
}
@article{Sorich2004Rapid,
  author = {Michael J Sorich and Ross A McKinnon and John O Miners and David
	A Winkler and Paul A Smith},
  title = {Rapid prediction of chemical metabolism by human {UDP}-glucuronosyltransferase
	isoforms using quantum chemical descriptors derived with the electronegativity
	equalization method},
  journal = {J Med Chem},
  year = {2004},
  volume = {47},
  pages = {5311--5317},
  number = {21},
  month = oct,
  abstract = {This study aimed to evaluate in silico models based on quantum chemical
	({QC}) descriptors derived using the electronegativity equalization
	method ({EEM}) and to assess the use of {QC} properties to predict
	chemical metabolism by human {UDP}-glucuronosyltransferase ({UGT})
	isoforms. {V}arious {EEM}-derived {QC} molecular descriptors were
	calculated for known {UGT} substrates and nonsubstrates. {C}lassification
	models were developed using support vector machine and partial least
	squares discriminant analysis. {I}n general, the most predictive
	models were generated with the support vector machine. {C}ombining
	{QC} and 2{D} descriptors (from previous work) using a consensus
	approach resulted in a statistically significant improvement in predictivity
	(to 84\%) over both the {QC} and 2{D} models and the other methods
	of combining the descriptors. {EEM}-derived {QC} descriptors were
	shown to be both highly predictive and computationally efficient.
	{I}t is likely that {EEM}-derived {QC} properties will be generally
	useful for predicting {ADMET} and physicochemical properties during
	drug discovery.},
  doi = {10.1021/jm0495529},
  pdf = {../local/Sorich2004Rapid.pdf},
  file = {Sorich2004Rapid.pdf:local/Sorich2004Rapid.pdf:PDF},
  keywords = {biosvm},
  url = {http://dx.doi.org/10.1021/jm0495529}
}
@article{Sorich2003Comparison,
  author = {M. J. Sorich and J. O. Miners and R. A. McKinnon and D. A. Winkler
	and F. R. Burden and P. A. Smith},
  title = {Comparison of linear and nonlinear classification algorithms for
	the prediction of drug and chemical metabolism by human {UDP}-glucuronosyltransferase
	isoforms},
  journal = {J Chem Inf Comput Sci},
  year = {2003},
  volume = {43},
  pages = {2019--2024},
  number = {6},
  abstract = {Partial least squares discriminant analysis ({PLSDA}), {B}ayesian
	regularized artificial neural network ({BRANN}), and support vector
	machine ({SVM}) methodologies were compared by their ability to classify
	substrates and nonsubstrates of 12 isoforms of human {UDP}-glucuronosyltransferase
	({UGT}), an enzyme ``superfamily'' involved in the metabolism of drugs,
	nondrug xenobiotics, and endogenous compounds. {S}imple two-dimensional
	descriptors were used to capture chemical information. {F}or each
	data set, 70\% of the data were used for training, and the remainder
	were used to assess the generalization performance. {I}n general,
	the {SVM} methodology was able to produce models with the best predictive
	performance, followed by {BRANN} and then {PLSDA}. {H}owever, a small
	number of data sets showed either equivalent or better predictability
	using {PLSDA}, which may indicate relatively linear relationships
	in these data sets. {A}ll {SVM} models showed predictive ability
	(>60\% of test set predicted correctly) and five out of the 12 test
	sets showed excellent prediction (>80\% prediction accuracy). {T}hese
	models represent the first use of pattern recognition methods to
	discriminate between substrates and nonsubstrates of human drug metabolizing
	enzymes and the first thorough assessment of three classification
	algorithms using multiple metabolic data sets.},
  doi = {10.1021/ci034108k},
  pdf = {../local/Sorich2003Comparison.pdf},
  file = {Sorich2003Comparison.pdf:local/Sorich2003Comparison.pdf:PDF},
  keywords = {biosvm},
  url = {http://dx.doi.org/10.1021/ci034108k}
}
@article{Stahura2004Virtual,
  author = {Florence L Stahura and J{\"u}rgen Bajorath},
  title = {Virtual screening methods that complement {HTS}},
  journal = {Comb Chem High Throughput Screen},
  year = {2004},
  volume = {7},
  pages = {259--269},
  number = {4},
  month = jun,
  abstract = {In this review, we discuss a number of computational methods that
	have been developed or adapted for molecule classification and virtual
	screening ({VS}) of compound databases. {I}n particular, we focus
	on approaches that are complementary to high-throughput screening
	({HTS}). {T}he discussion is limited to {VS} methods that operate
	at the small molecular level, which is often called ligand-based
	{VS} ({LBVS}), and does not take into account docking algorithms
	or other structure-based screening tools. {W}e describe areas that
	greatly benefit from combining virtual and biological screening and
	discuss computational methods that are most suitable to contribute
	to the integration of screening technologies. {R}elevant approaches
	range from established methods such as clustering or similarity searching
	to techniques that have only recently been introduced for {LBVS}
	applications such as statistical methods or support vector machines.
	{F}inally, we discuss a number of representative applications at
	the interface between {VS} and {HTS}.},
  keywords = {Algorithms, Animals, Antisense, Artificial Intelligence, Cell Line,
	Cluster Analysis, Comparative Study, Computational Biology, Computer
	Simulation, DNA Fingerprinting, Drug Evaluation, Fluorescence, Fuzzy
	Logic, Gene Silencing, Gene Targeting, Genetic, Hela Cells, Humans,
	Imaging, Intracellular Space, Microscopy, Models, Neoplasms, Neural
	Networks (Computer), Non-U.S. Gov't, Oligonucleotides, P.H.S., Preclinical,
	Prognosis, Proteomics, Quantitative Structure-Activity Relationship,
	RNA, RNA Interference, Research Support, Sensitivity and Specificity,
	Small Interfering, Thionucleotides, Three-Dimensional, Tumor, U.S.
	Gov't, 15200375}
}
@inproceedings{Stapley2002Predicting,
  author = {Stapley, B. J. and Kelley, L. A. and Sternberg, M. J.},
  title = {Predicting the sub-cellular location of proteins from text using
	support vector machines},
  booktitle = {Proceedings of the Pacific Symposium on Biocomputing 2002},
  year = {2002},
  editor = {Russ B. Altman and A. Keith Dunker and Lawrence Hunter and Kevin
	Lauderdale and Teri E. Klein},
  pages = {374--385},
  publisher = {World Scientific},
  abstract = {We present an automatic method to classify the sub-cellular location
	of proteins based on the text of relevant medline abstracts. {F}or
	each protein, a vector of terms is generated from medline abstracts
	in which the protein/gene's name or synonym occurs. {A} {S}upport
	{V}ector {M}achine ({SVM}) is used to automatically partition the
	term space and to thus discriminate the textual features that define
	sub-cellular location. {T}he method is benchmarked on a set of proteins
	of known sub-cellular location from {S}. cerevisiae. {N}o prior knowledge
	of the problem domain nor any natural language processing is used
	at any stage. {T}he method out-performs support vector machines trained
	on amino acid composition and has comparable performance to rule-based
	text classifiers. {C}ombining text with protein amino-acid composition
	improves recall for some sub-cellular locations. {W}e discuss the
	generality of the method and its potential application to a variety
	of biological classification problems.},
  pdf = {../local/Stapley2002Predicting.pdf},
  file = {Stapley2002Predicting.pdf:local/Stapley2002Predicting.pdf:PDF},
  keywords = {biosvm},
  subject = {biokernel},
  url = {http://www.smi.stanford.edu/projects/helix/psb02/stapley.pdf}
}
@article{Statnikov2004Methods,
  author = {Alexander Statnikov and Constantin F Aliferis and Ioannis Tsamardinos},
  title = {Methods for multi-category cancer diagnosis from gene expression
	data: a comprehensive evaluation to inform decision support system
	development},
  journal = {Medinfo},
  year = {2004},
  volume = {11},
  pages = {813--817},
  number = {Pt 2},
  abstract = {Cancer diagnosis is a major clinical applications area of gene expression
	microarray technology. {W}e are seeking to develop a system for cancer
	diagnostic model creation based on microarray data. {I}n order to
	equip the system with the optimal combination of data modeling methods,
	we performed a comprehensive evaluation of several major classification
	algorithms, gene selection methods, and cross-validation designs
	using 11 datasets spanning 74 diagnostic categories (41 cancer types
	and 12 normal tissue types). {T}he {M}ulti-{C}ategory {S}upport {V}ector
	{M}achine techniques by {C}rammer and {S}inger, {W}eston and {W}atkins,
	and one-versus-rest were found to be the best methods and they outperform
	other learning algorithms such as {K}-{N}earest {N}eighbors and {N}eural
	{N}etworks often to a remarkable degree. {G}ene selection techniques
	are shown to significantly improve classification performance. {T}hese
	results guided the development of a software system that fully automates
	cancer diagnostic model construction with quality on par with or
	better than previously published results derived by expert human
	analysts.},
  keywords = {biosvm},
  pii = {D040004907}
}
@article{Statnikov2005comprehensive,
  author = {Statnikov, A. and Aliferis, C. F. and Tsamardinos, I. and Hardin,
	D. and Levy, S.},
  title = {A comprehensive evaluation of multicategory classification methods
	for microarray gene expression cancer diagnosis},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21},
  pages = {631--643},
  number = {5},
  abstract = {Motivation: {C}ancer diagnosis is one of the most important emerging
	clinical applications of gene expression microarray technology. {W}e
	are seeking to develop a computer system for powerful and reliable
	cancer diagnostic model creation based on microarray data. {T}o keep
	a realistic perspective on clinical applications we focus on multicategory
	diagnosis. {I}n order to equip the system with the optimum combination
	of classifier, gene selection and cross-validation methods, we performed
	a systematic and comprehensive evaluation of several major algorithms
	for multicategory classification, several gene selection methods,
	multiple ensemble classifier methods, and two cross validation designs
	using 11 datasets spanning 74 diagnostic categories and 41 cancer
	types and 12 normal tissue types. {R}esults: {M}ulticategory {S}upport
	{V}ector {M}achines ({MC}-{SVM}s) are the most effective classifiers
	in performing accurate cancer diagnosis from gene expression data.
	{T}he {MC}-{SVM} techniques by {C}rammer and {S}inger, {W}eston and
	{W}atkins, and one-versus-rest were found to be the best methods
	in this domain. {MC}-{SVM}s outperform other popular machine learning
	algorithms such as {K}-{N}earest {N}eighbors, {B}ackpropagation and
	{P}robabilistic {N}eural {N}etworks, often to a remarkable degree.
	{G}ene selection techniques can significantly improve classification
	performance of both {MC}-{SVM}s and other non-{SVM} learning algorithms.
	{E}nsemble classifiers do not generally improve performance of the
	best non-ensemble models. {T}hese results guided the construction
	of a software system {GEMS} ({G}ene {E}xpression {M}odel {S}elector)
	that automates high-quality model construction and enforces sound
	optimization and performance estimation procedures. {T}his is the
	first such system to be informed by a rigorous comparative analysis
	of the available algorithms and datasets. {A}vailability: {T}he software
	system {GEMS} is available for download from http://www.gems-system.org
	for non-commercial use.},
  doi = {10.1093/bioinformatics/bti033},
  pdf = {../local/Statnikov2005comprehensive.pdf},
  file = {Statnikov2005comprehensive.pdf:local/Statnikov2005comprehensive.pdf:PDF},
  keywords = {biosvm microarray},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/bti033v1}
}
@article{Steiner2004Discriminating,
  author = {Guido Steiner and Laura Suter and Franziska Boess and Rodolfo Gasser
	and Maria Cristina de Vera and Silvio Albertini and Stefan Ruepp},
  title = {Discriminating different classes of toxicants by transcript profiling},
  journal = {Environ. {H}ealth {P}erspect.},
  year = {2004},
  volume = {112},
  pages = {1236--1248},
  number = {12},
  month = aug,
  abstract = {Male rats were treated with various model compounds or the appropriate
	vehicle controls. {M}ost substances were either well-known hepatotoxicants
	or showed hepatotoxicity during preclinical testing. {T}he aim of
	the present study was to determine if biological samples from rats
	treated with various compounds can be classified based on gene expression
	profiles. {I}n addition to gene expression analysis using microarrays,
	a complete serum chemistry profile and liver and kidney histopathology
	were performed. {W}e analyzed hepatic gene expression profiles using
	a supervised learning method (support vector machines; {SVM}s) to
	generate classification rules and combined this with recursive feature
	elimination to improve classification performance and to identify
	a compact subset of probe sets with potential use as biomarkers.
	{T}wo different {SVM} algorithms were tested, and the models obtained
	were validated with a compound-based external cross-validation approach.
	{O}ur predictive models were able to discriminate between hepatotoxic
	and nonhepatotoxic compounds. {F}urthermore, they predicted the correct
	class of hepatotoxicant in most cases. {W}e provide an example showing
	that a predictive model built on transcript profiles from one rat
	strain can successfully classify profiles from another rat strain.
	{I}n addition, we demonstrate that the predictive models identify
	nonresponders and are able to discriminate between gene changes related
	to pharmacology and toxicity. {T}his work confirms the hypothesis
	that compound classification based on gene expression data is feasible.},
  pdf = {../local/Steiner2004Discriminating.pdf},
  file = {Steiner2004Discriminating.pdf:local/Steiner2004Discriminating.pdf:PDF},
  keywords = {biosvm},
  url = {http://ehp.niehs.nih.gov/txg/docs/2004/7036/abstract.html}
}
@article{Strahl2000language,
  author = {Strahl, B. D. and Allis, C. D.},
  title = {The language of covalent histone modifications},
  journal = {Nature},
  year = {2000},
  volume = {403},
  pages = {41--45},
  number = {6765},
  month = jan,
  abstract = {Histone proteins and the nucleosomes they form with DNA are the fundamental
	building blocks of eukaryotic chromatin. A diverse array of post-translational
	modifications that often occur on tail domains of these proteins
	has been well documented. Although the function of these highly conserved
	modifications has remained elusive, converging biochemical and genetic
	evidence suggests functions in several chromatin-based processes.
	We propose that distinct histone modifications, on one or more tails,
	act sequentially or in combination to form a 'histone code' that
	is, read by other proteins to bring about distinct downstream events.},
  doi = {10.1038/47412},
  institution = {Department of Biochemistry and Molecular Genetics, University of
	Virginia Health Science Center, Charlottesville 22908, USA.},
  keywords = {Acetylation; Amino Acid Sequence; Animals; Chromatin, physiology;
	Histones, chemistry/metabolism/physiology; Humans; Lysine, physiology;
	Microtubules, physiology; Models, Biological; Molecular Sequence
	Data; Phosphorylation; Protein Processing, Post-Translational; Serine,
	metabolism},
  language = {eng},
  medline-pst = {ppublish},
  owner = {jp},
  pmid = {10638745},
  timestamp = {2010.11.23},
  url = {http://dx.doi.org/10.1038/47412}
}
@article{Sturn2002Genesis:,
  author = {Alexander Sturn and John Quackenbush and Zlatko Trajanoski},
  title = {Genesis: cluster analysis of microarray data},
  journal = {Bioinformatics},
  year = {2002},
  volume = {18},
  pages = {207--208},
  number = {1},
  month = jan,
  abstract = {A versatile, platform independent and easy to use {J}ava suite for
	large-scale gene expression analysis was developed. {G}enesis integrates
	various tools for microarray data analysis such as filters, normalization
	and visualization tools, distance measures as well as common clustering
	algorithms including hierarchical clustering, self-organizing maps,
	k-means, principal component analysis, and support vector machines.
	{T}he results of the clustering are transparent across all implemented
	methods and enable the analysis of the outcome of different algorithms
	and parameters. {A}dditionally, mapping of gene expression data onto
	chromosomal sequences was implemented to enhance promoter analysis
	and investigation of transcriptional control mechanisms.},
  keywords = {Algorithms, Artificial Intelligence, Cluster Analysis, Comparative
	Study, Computational Biology, Databases, Gene Expression Profiling,
	Genetic, Models, Molecular Structure, Neural Networks (Computer),
	Non-U.S. Gov't, Oligonucleotide Array Sequence Analysis, Principal
	Component Analysis, Programming Languages, Promoter Regions (Genetics),
	Protein, Proteins, Research Support, Software, Statistical, Transcription},
  pmid = {11836235}
}
@article{Su2001Molecular,
  author = {Su, A. I. and Welsh, J. B. and Sapinoso, L. M. and Kern, S. G. and
	Dimitrov, P. and Lapp, H. and Schultz, P. G. and Powell, S. M. and
	Moskaluk, C. A. and Frierson, Jr., H. F. and Hampton, G. M.},
  title = {Molecular {C}lassification of {H}uman {C}arcinomas by {U}se of {G}ene
	{E}xpression {S}ignatures},
  journal = {Cancer {R}es.},
  year = {2001},
  volume = {61},
  pages = {7388--7393},
  number = {20},
  abstract = {Classification of human tumors according to their primary anatomical
	site of origin is fundamental for the optimal treatment of patients
	with cancer. {H}ere we describe the use of large-scale {RNA} profiling
	and supervised machine learning algorithms to construct a first-generation
	molecular classification scheme for carcinomas of the prostate, breast,
	lung, ovary, colorectum, kidney, liver, pancreas, bladder/ureter,
	and gastroesophagus, which collectively account for $\sim$70\% of all
	cancer-related deaths in the {U}nited {S}tates. {T}he classification
	scheme was based on identifying gene subsets whose expression typifies
	each cancer class, and we quantified the extent to which these genes
	are characteristic of a specific tumor type by accurately and confidently
	predicting the anatomical site of tumor origin for 90\% of 175 carcinomas,
	including 9 of 12 metastatic lesions. {T}he predictor gene subsets
	include those whose expression is typical of specific types of normal
	epithelial differentiation, as well as other genes whose expression
	is elevated in cancer. {T}his study demonstrates the feasibility
	of predicting the tissue origin of a carcinoma in the context of
	multiple cancer classes.},
  pdf = {../local/Su2001Molecular.pdf.html},
  file = {Su2001Molecular.pdf.html:local/Su2001Molecular.pdf.html:PDF},
  keywords = {biosvm, breastcancer},
  owner = {jeanphilippevert},
  url = {http://cancerres.aacrjournals.org/cgi/content/abstract/61/20/7388}
}
@article{Su2003RankGene,
  author = {Su, Yang and Murali, T. M. and Pavlovic, Vladimir and Schaffer, Michael
	and Kasif, Simon},
  title = {{RankGene}: identification of diagnostic genes based on expression
	data},
  journal = {Bioinformatics},
  year = {2003},
  volume = {19},
  pages = {1578--1579},
  number = {12},
  abstract = {Summary: {R}ank{G}ene is a program for analyzing gene expression data
	and computing diagnostic genes based on their predictive power in
	distinguishing between different types of samples. {T}he program
	integrates into one system a variety of popular ranking criteria,
	ranging from the traditional t-statistic to one-dimensional support
	vector machines. {T}his flexibility makes {R}ank{G}ene a useful tool
	in gene expression analysis and feature selection. {A}vailability:
	http://genomics10.bu.edu/yangsu/rankgene {C}ontact: murali@bu.edu},
  pdf = {../local/Su2003RankGene.pdf},
  file = {Su2003RankGene.pdf:local/Su2003RankGene.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/12/1578}
}
@article{Sun2003Identifying,
  author = {Sun, Y. F. and Fan, X. D. and Li, Y. D.},
  title = {Identifying splicing sites in eukaryotic {RNA}: support vector machine
	approach},
  journal = {Comput. {B}iol. {M}ed.},
  year = {2003},
  volume = {33},
  pages = {17--29},
  number = {1},
  abstract = {We introduce a new method for splicing sites prediction based on the
	theory of support vector machines ({SVM}). {T}he {SVM} represents
	a new approach to supervised pattern classification and has been
	successfully applied to a wide range of pattern recognition problems.
	{I}n the process of splicing sites prediction, the statistical information
	of {RNA} secondary structure in the vicinity of splice sites, e.g.
	donor and acceptor sites, is introduced in order to compare recognition
	ratio of true positive and true negative. {F}rom the results of comparison,
	addition of structural information has brought no significant benefit
	for the recognition of splice sites and had even lowered the rate
	of recognition. {O}ur results suggest that, through three cross validation,
	the {SVM} method can achieve a good performance for splice sites
	identification.},
  doi = {10.1016/S0010-4825(02)00057-4},
  pdf = {../local/Sun2003Identifying.pdf},
  file = {Sun2003Identifying.pdf:local/Sun2003Identifying.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1016/S0010-4825(02)00057-4}
}
@article{Sun2004protein,
  author = {Zhenghong Sun and Xiaoli Fu and Lu Zhang and Xiaoli Yang and Feizhou
	Liu and Gengxi Hu},
  title = {A protein chip system for parallel analysis of multi-tumor markers
	and its application in cancer detection},
  journal = {Anticancer {R}es},
  year = {2004},
  volume = {24},
  pages = {1159--1165},
  number = {2C},
  abstract = {{BACKGROUND}: {T}umor markers are routinely measured in clinical oncology.
	{H}owever, their value in cancer detection has been controversial
	largely because no single tumor marker is sensitive and specific
	enough to meet strict diagnostic criteria. {O}ne strategy to overcome
	the shortcomings of single tumor markers is to measure a combination
	of tumor markers to increase sensitivity and look for distinct patterns
	to increase specificity. {T}his study aimed to develop a system for
	parallel detection of tumor markers as a tool for tumor detection
	in both cancer patients and asymptomatic populations at high risk.
	{MATERIALS} {AND} {METHODS}: {A} protein chip was fabricated with
	twelve monoclonal antibodies against the following tumor markers
	respectively: {CA}125, {CA}15-3, {CA}19-9, {CA}242, {CEA}, {AFP},
	{PSA}, free-{PSA}, {HGH}, beta-{HCG}, {NSE} and ferritin. {T}umor
	markers were captured after the protein chip was incubated with serum
	samples. {A} secondary antibody conjugated with {HRP} was used to
	detect the captured tumor markers using chemiluminescence technique.
	{Q}uantification of the tumor markers was obtained after calibration
	with standard curves. {RESULTS}: {T}he chip system showed an overall
	sensitivity of 68.18\% after testing 1147 cancer patients, with high
	sensitivities for liver, pancreas and ovarian tumors and low sensitivities
	for gastrointestinal tumors, and a specificity of 97.1\% after testing
	793 healthy individuals. {A}pplication of the chip system in physical
	checkups of 15,867 individuals resulted in 16 cases that were subsequently
	confirmed as having cancers. {A}nalysis of the detection results
	with a {S}upport {V}ector {M}achine algorithm considerably increased
	the specificity of the system as reflected in healthy individuals
	and hepatitis/cirrhosis patients, but only modestly decreased the
	sensitivity for cancer patients. {CONCLUSION}: {T}his protein chip
	system is a potential tool for assisting cancer diagnosis and for
	screening cancer in high-risk populations.},
  keywords = {Antibodies, Artificial Intelligence, Biological, Calibration, Female,
	Horseradish Peroxidase, Humans, Male, Monoclonal, Neoplasms, Protein
	Array Analysis, Sensitivity and Specificity, Tumor Markers},
  pmid = {15154641}
}
@article{Sutherland2009Transcription,
  author = {Heidi Sutherland and Wendy A Bickmore},
  title = {Transcription factories: gene expression in unions?},
  journal = {Nat Rev Genet},
  year = {2009},
  volume = {10},
  pages = {457--466},
  number = {7},
  month = jul,
  abstract = {Transcription is a fundamental step in gene expression, yet it remains
	poorly understood at a cellular level. Visualization of transcription
	sites and active genes has led to the suggestion that transcription
	occurs at discrete sites in the nucleus, termed transcription factories,
	where multiple active RNA polymerases are concentrated and anchored
	to a nuclear substructure. However, this concept is not universally
	accepted. This Review discusses the experimental evidence in support
	of the transcription factory model and the evidence that argues against
	such a spatially structured view of transcription. The transcription
	factory model has implications for the regulation of transcription
	initiation and elongation, for the organization of genes in the genome,
	for the co-regulation of genes and for genome instability.},
  doi = {10.1038/nrg2592},
  institution = {MRC Human Genetics Unit, Institute of Genetics and Molecular Medicine,
	Crewe Road, Edinburgh EH4 2XU, UK.},
  keywords = {Animals; Cell Nucleus; DNA-Directed RNA Polymerases; Genome; Genomic
	Instability; Humans; Models, Biological; Transcription, Genetic},
  owner = {phupe},
  pii = {nrg2592},
  pmid = {19506577},
  timestamp = {2010.08.27},
  url = {http://dx.doi.org/10.1038/nrg2592}
}
@article{Suykens2001Optimal,
  author = {Suykens, J. A. and Vandewalle, J. and De Moor, B.},
  title = {Optimal control by least squares support vector machines},
  journal = {Neural {N}etw},
  year = {2001},
  volume = {14},
  pages = {23--35},
  number = {1},
  month = jan,
  abstract = {Support vector machines have been very successful in pattern recognition
	and function estimation problems. {I}n this paper we introduce the
	use of least squares support vector machines ({LS}-{SVM}'s) for the
	optimal control of nonlinear systems. {L}inear and neural full static
	state feedback controllers are considered. {T}he problem is formulated
	in such a way that it incorporates the {N}-stage optimal control
	problem as well as a least squares support vector machine approach
	for mapping the state space into the action space. {T}he solution
	is characterized by a set of nonlinear equations. {A}n alternative
	formulation as a constrained nonlinear optimization problem in less
	unknowns is given, together with a method for imposing local stability
	in the {LS}-{SVM} control scheme. {T}he results are discussed for
	support vector machines with radial basis function kernel. {A}dvantages
	of {LS}-{SVM} control are that no number of hidden units has to be
	determined for the controller and that no centers have to be specified
	for the {G}aussian kernels when applying {M}ercer's condition. {T}he
	curse of dimensionality is avoided in comparison with defining a
	regular grid for the centers in classical radial basis function networks.
	{T}his is at the expense of taking the trajectory of state variables
	as additional unknowns in the optimization problem, while classical
	neural network approaches typically lead to parametric optimization
	problems. {I}n the {SVM} methodology the number of unknowns equals
	the number of training data, while in the primal space the number
	of unknowns can be infinite dimensional. {T}he method is illustrated
	both on stabilization and tracking problems including examples on
	swinging up an inverted pendulum with local stabilization at the
	endpoint and a tracking problem for a ball and beam system.},
  keywords = {Acute, Acute Disease, Adenocarcinoma, Algorithms, Amino Acid Sequence,
	Artificial Intelligence, Automated, B-Lymphocytes, Bacterial Proteins,
	Base Pair Mismatch, Base Sequence, Bayes Theorem, Binding Sites,
	Biological, Bone Marrow Cells, Cell Compartmentation, Chemistry,
	Child, Chromosome Aberrations, Comparative Study, Computational Biology,
	Computer Simulation, Computer-Assisted, DNA, Data Interpretation,
	Databases, Decision Trees, Diagnosis, Discriminant Analysis, Electric
	Conductivity, Electrophysiology, Escherichia coli Proteins, Factual,
	Feedback, Female, Fungal, Gastric Emptying, Gene Expression Profiling,
	Gene Expression Regulation, Genes, Genetic, Genetic Markers, Hemolysins,
	Humans, Ion Channels, Kinetics, Leukemia, Lipid Bilayers, Logistic
	Models, Lymphocytic, Male, Markov Chains, Melanoma, Models, Molecular,
	Myeloid, Neoplasm, Neoplastic, Neural Networks (Computer), Nevus,
	Non-P.H.S., Non-U.S. Gov't, Nonlinear Dynamics, Normal Distribution,
	Nucleic Acid Conformation, Organ Specificity, Organelles, P.H.S.,
	Pattern Recognition, Physical, Pigmented, Predictive Value of Tests,
	Promoter Regions (Genetics), Protein Folding, Protein Structure,
	Proteins, Proteome, RNA, Reproducibility of Results, Research Support,
	Saccharomyces cerevisiae, Secondary, Sensitivity and Specificity,
	Sequence Alignment, Sex Characteristics, Skin Diseases, Skin Neoplasms,
	Skin Pigmentation, Software, Statistical, Stomach Diseases, T-Lymphocytes,
	Thermodynamics, Transcription, Transcription Factors, Tumor Markers,
	U.S. Gov't},
  pii = {S0893608000000770},
  pmid = {11213211}
}
@article{Swamidass2005Kernels,
  author = {Swamidass, S. J. and Chen, J. and Bruand, J. and Phung, P. and Ralaivola,
	L. and Baldi, P.},
  title = {Kernels for small molecules and the prediction of mutagenicity, toxicity
	and anti-cancer activity},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21},
  pages = {i359--i368},
  number = {Suppl. 1},
  month = jun,
  abstract = {{MOTIVATION}: {S}mall molecules play a fundamental role in organic
	chemistry and biology. {T}hey can be used to probe biological systems
	and to discover new drugs and other useful compounds. {A}s increasing
	numbers of large datasets of small molecules become available, it
	is necessary to develop computational methods that can deal with
	molecules of variable size and structure and predict their physical,
	chemical and biological properties. {RESULTS}: {H}ere we develop
	several new classes of kernels for small molecules using their 1{D},
	2{D} and 3{D} representations. {I}n 1{D}, we consider string kernels
	based on {SMILES} strings. {I}n 2{D}, we introduce several similarity
	kernels based on conventional or generalized fingerprints. {G}eneralized
	fingerprints are derived by counting in different ways subpaths contained
	in the graph of bonds, using depth-first searches. {I}n 3{D}, we
	consider similarity measures between histograms of pairwise distances
	between atom classes. {T}hese kernels can be computed efficiently
	and are applied to problems of classification and prediction of mutagenicity,
	toxicity and anti-cancer activity on three publicly available datasets.
	{T}he results derived using cross-validation methods are state-of-the-art.
	{T}radeoffs between various kernels are briefly discussed. {AVAILABILITY}:
	{D}atasets available from http://www.igb.uci.edu/servers/servers.html
	{CONTACT}: pfbaldi@ics.uci.edu.},
  doi = {10.1093/bioinformatics/bti1055},
  pdf = {../local/Swamidass2005Kernels.pdf},
  file = {Swamidass2005Kernels.pdf:local/Swamidass2005Kernels.pdf:PDF},
  keywords = {biosvm},
  pii = {21/suppl_1/i359},
  url = {http://dx.doi.org/10.1093/bioinformatics/bti1055}
}
@article{Taby2010Cancer,
  author = {Rodolphe Taby and Jean-Pierre J Issa},
  title = {Cancer epigenetics},
  journal = {CA Cancer J Clin},
  year = {2010},
  volume = {60},
  pages = {376--392},
  number = {6},
  abstract = {Epigenetics refers to stable alterations in gene expression with no
	underlying modifications in the genetic sequence and is best exemplified
	by differentiation, in which multiple cell types diverge physiologically
	despite a common genetic code. Interest in this area of science has
	grown over the past decades, especially since it was found to play
	a major role in physiologic phenomena such as embryogenesis, imprinting,
	and X chromosome inactivation, and in disease states such as cancer.
	The latter had been previously thought of as a disease with an exclusive
	genetic etiology. However, recent data have demonstrated that the
	complexity of human carcinogenesis cannot be accounted for by genetic
	alterations alone, but also involves epigenetic changes in processes
	such as DNA methylation, histone modifications, and microRNA expression.
	In turn, these molecular alterations lead to permanent changes in
	the expression of genes that regulate the neoplastic phenotype, such
	as cellular growth and invasiveness. Targeting epigenetic modifiers
	has been referred to as epigenetic therapy. The success of this approach
	in hematopoietic malignancies validates the importance of epigenetic
	alterations in cancer, not only at the therapeutic level but also
	with regard to prevention, diagnosis, risk stratification, and prognosis.},
  doi = {10.3322/caac.20085},
  institution = {Department of Leukemia, The University of Texas M. D. Anderson Cancer
	Center, Houston, TX 77030, USA.},
  keywords = {Animals; Cell Cycle, genetics; Cell Transformation, Neoplastic, genetics;
	DNA Methylation; Epigenesis, Genetic; Histones, genetics; Humans;
	MicroRNAs, genetics; Neoplasm Invasiveness, genetics; Neoplasms,
	classification/diagnosis/genetics/metabolism/prevention \& control/therapy;
	Prognosis; Risk Assessment; Tumor Markers, Biological, genetics},
  language = {eng},
  medline-pst = {ppublish},
  owner = {philippe},
  pii = {caac.20085},
  pmid = {20959400},
  timestamp = {2011.06.04},
  url = {http://dx.doi.org/10.3322/caac.20085}
}
@article{Takaoka2003Development,
  author = {Y. Takaoka and Y. Endo and S. Yamanobe and H. Kakinuma and T. Okubo
	and Y. Shimazaki and T. Ota and S. Sumiya and K. Yoshikawa},
  title = {Development of a method for evaluating drug-likeness and ease of
	synthesis using a data set in which compounds are assigned scores
	based on chemists' intuition},
  journal = {J {C}hem {I}nf {C}omput {S}ci},
  year = {2003},
  volume = {43},
  pages = {1269--1275},
  number = {4},
  abstract = {The concept of drug-likeness, an important characteristic for any
	compound in a screening library, is nevertheless difficult to pin
	down. {B}ased on our belief that this concept is implicit within
	the collective experience of working chemists, we devised a data
	set to capture an intuitive human understanding of both this characteristic
	and ease of synthesis, a second key characteristic. {F}ive chemists
	assigned a pair of scores to each of 3980 diverse compounds, with
	the component scores of each pair corresponding to drug-likeness
	and ease of synthesis, respectively. {U}sing this data set, we devised
	binary classifiers with an artificial neural network and a support
	vector machine. {T}hese models were found to efficiently eliminate
	compounds that are not drug-like and/or hard-to-synthesize derivatives,
	demonstrating the suitability of these models for use as compound
	acquisition filters.},
  doi = {10.1021/ci034043l},
  pdf = {../local/Takaoka2003Development.pdf},
  file = {Takaoka2003Development.pdf:local/Takaoka2003Development.pdf:PDF},
  keywords = {biosvm},
  url = {http://dx.doi.org/10.1021/ci034043l}
}
@article{Takeuchi2005Bio-medical,
  author = {Koichi Takeuchi and Nigel Collier},
  title = {Bio-medical entity extraction using support vector machines},
  journal = {Artif. {I}ntell. {M}ed.},
  year = {2005},
  volume = {33},
  pages = {125--137},
  number = {2},
  month = feb,
  abstract = {{OBJECTIVE}: {S}upport vector machines ({SVM}s) have achieved state-of-the-art
	performance in several classification tasks. {I}n this article we
	apply them to the identification and semantic annotation of scientific
	and technical terminology in the domain of molecular biology. {T}his
	illustrates the extensibility of the traditional named entity task
	to special domains with large-scale terminologies such as those in
	medicine and related disciplines. {METHODS} {AND} {MATERIALS}: {T}he
	foundation for the model is a sample of text annotated by a domain
	expert according to an ontology of concepts, properties and relations.
	{T}he model then learns to annotate unseen terms in new texts and
	contexts. {T}he results can be used for a variety of intelligent
	language processing applications. {W}e illustrate {SVM}s capabilities
	using a sample of 100 journal abstracts texts taken from the \{human,
	blood cell, transcription factor\} domain of {MEDLINE}. {RESULTS}:
	{A}pproximately 3400 terms are annotated and the model performs at
	about 74\% {F}-score on cross-validation tests. {A} detailed analysis
	based on empirical evidence shows the contribution of various feature
	sets to performance. {CONCLUSION}: {O}ur experiments indicate a relationship
	between feature window size and the amount of training data and that
	a combination of surface words, orthographic features and head noun
	features achieve the best performance among the feature sets tested.},
  doi = {10.1016/j.artmed.2004.07.019},
  pdf = {../local/Takeuchi2005Bio-medical.pdf},
  file = {Takeuchi2005Bio-medical.pdf:local/Takeuchi2005Bio-medical.pdf:PDF},
  keywords = {biosvm},
  pii = {S0933-3657(04)00130-7},
  url = {http://dx.doi.org/10.1016/j.artmed.2004.07.019}
}
@article{Tang2005Discovering,
  author = {Thomas Tang and Jinbo Xu and Ming Li},
  title = {Discovering sequence-structure motifs from protein segments and two
	applications},
  journal = {Pac {S}ymp {B}iocomput},
  year = {2005},
  pages = {370--381},
  abstract = {We present a novel method for clustering short protein segments having
	strong sequence-structure correlations, and demonstrate that these
	clusters contain useful structural information via two applications.
	{W}hen applied to local tertiary structure prediction, we achieve
	approximately 60\% accuracy with a novel dynamic programming algorithm.
	{W}hen applied to secondary structure prediction based on {S}upport
	{V}ector {M}achines, we obtain a approximately 2\% gain in {Q}3 performance
	by incorporating cluster-derived data into training and classification.
	{T}hese encouraging results illustrate the great potential of using
	conserved local motifs to tackle protein structure predictions and
	possibly other important problems in biology.},
  keywords = {biosvm}
}
@article{Teramoto2005Prediction,
  author = {Reiji Teramoto and Mikio Aoki and Toru Kimura and Masaharu Kanaoka},
  title = {Prediction of {siRNA} functionality using generalized string kernel
	and support vector machine},
  journal = {F{EBS} {L}ett.},
  year = {2005},
  volume = {579},
  pages = {2878--2882},
  number = {13},
  month = may,
  abstract = {Small interfering {RNA}s (si{RNA}s) are becoming widely used for sequence-specific
	gene silencing in mammalian cells, but designing an effective si{RNA}
	is still a challenging task. {I}n this study, we developed an algorithm
	for predicting si{RNA} functionality by using generalized string
	kernel ({GSK}) combined with support vector machine ({SVM}). {W}ith
	{GSK}, si{RNA} sequences were represented as vectors in a multi-dimensional
	feature space according to the numbers of subsequences in each si{RNA},
	and subsequently classified with {SVM} into effective or ineffective
	si{RNA}s. {W}e applied this algorithm to published si{RNA}s, and
	could classify effective and ineffective si{RNA}s with 90.6\%, 86.2\%
	accuracy, respectively.},
  doi = {10.1016/j.febslet.2005.04.045},
  pdf = {../local/Teramoto2005Prediction.pdf},
  file = {Teramoto2005Prediction.pdf:local/Teramoto2005Prediction.pdf:PDF},
  keywords = {sirna biosvm},
  pii = {S0014-5793(05)00520-X},
  url = {http://dx.doi.org/10.1016/j.febslet.2005.04.045}
}
@article{Terentiev2009Dynamic,
  author = {A. A. Terentiev and N. T. Moldogazieva and K. V. Shaitan},
  title = {Dynamic proteomics in modeling of the living cell. {Protein}-protein
	interactions},
  journal = {Biochemistry (Mosc)},
  year = {2009},
  volume = {74},
  pages = {1586--1607},
  number = {13},
  month = dec,
  abstract = {This review is devoted to describing, summarizing, and analyzing of
	dynamic proteomics data obtained over the last few years and concerning
	the role of protein-protein interactions in modeling of the living
	cell. Principles of modern high-throughput experimental methods for
	investigation of protein-protein interactions are described. Systems
	biology approaches based on integrative view on cellular processes
	are used to analyze organization of protein interaction networks.
	It is proposed that finding of some proteins in different protein
	complexes can be explained by their multi-modular and polyfunctional
	properties; the different protein modules can be located in the nodes
	of protein interaction networks. Mathematical and computational approaches
	to modeling of the living cell with emphasis on molecular dynamics
	simulation are provided. The role of the network analysis in fundamental
	medicine is also briefly reviewed.},
  institution = {Russian State Medical University, ul. Ostrovityanova 1, Moscow, Russia.
	aaterent@mtu-net.ru},
  keywords = {Animals; Humans; Mass Spectrometry; Models, Theoretical; Molecular
	Dynamics Simulation; Multiprotein Complexes; Protein Conformation;
	Protein Interaction Mapping; Proteins; Proteomics; Systems Biology;
	Two-Hybrid System Techniques},
  owner = {phupe},
  pii = {BCM74131586},
  pmid = {20210711},
  timestamp = {2010.08.31}
}
@article{Thukral2005Prediction,
  author = {Sushil K Thukral and Paul J Nordone and Rong Hu and Leah Sullivan
	and Eric Galambos and Vincent D Fitzpatrick and Laura Healy and Michael
	B Bass and Mary E Cosenza and Cynthia A Afshari},
  title = {Prediction of nephrotoxicant action and identification of candidate
	toxicity-related biomarkers.},
  journal = {Toxicol {P}athol},
  year = {2005},
  volume = {33},
  pages = {343--355},
  number = {3},
  abstract = {A vast majority of pharmacological compounds and their metabolites
	are excreted via the urine, and within the complex structure of the
	kidney, the proximal tubules are a main target site of nephrotoxic
	compounds. {W}e used the model nephrotoxicants mercuric chloride,
	2-bromoethylamine hydrobromide, hexachlorobutadiene, mitomycin, amphotericin,
	and puromycin to elucidate time- and dose-dependent global gene expression
	changes associated with proximal tubular toxicity. {M}ale {S}prague-{D}awley
	rats were dosed via intraperitoneal injection once daily for mercuric
	chloride and amphotericin (up to 7 doses), while a single dose was
	given for all other compounds. {A}nimals were exposed to 2 different
	doses of these compounds and kidney tissues were collected on day
	1, 3, and 7 postdosing. {G}ene expression profiles were generated
	from kidney {RNA} using 17{K} rat c{DNA} dual dye microarray and
	analyzed in conjunction with histopathology. {A}nalysis of gene expression
	profiles showed that the profiles clustered based on similarities
	in the severity and type of pathology of individual animals. {F}urther,
	the expression changes were indicative of tubular toxicity showing
	hallmarks of tubular degeneration/regeneration and necrosis. {U}se
	of gene expression data in predicting the type of nephrotoxicity
	was then tested with a support vector machine ({SVM})-based approach.
	{A} {SVM} prediction module was trained using 120 profiles of total
	profiles divided into four classes based on the severity of pathology
	and clustering. {A}lthough mitomycin {C} and amphotericin {B} treatments
	did not cause toxicity, their expression profiles were included in
	the {SVM} prediction module to increase the sample size. {U}sing
	this classifier, the {SVM} predicted the type of pathology of 28
	test profiles with 100\% selectivity and 82\% sensitivity. {T}hese
	data indicate that valid predictions could be made based on gene
	expression changes from a small set of expression profiles. {A} set
	of potential biomarkers showing a time- and dose-response with respect
	to the progression of proximal tubular toxicity were identified.
	{T}hese include several transporters ({S}lc21a2, {S}lc15, {S}lc34a2),
	{K}im 1, {IGF}bp-1, osteopontin, alpha-fibrinogen, and {G}stalpha.},
  doi = {10.1080/01926230590927230},
  keywords = {Algorithms, Animals, Antibiotics, Antineoplastic, Artificial Intelligence,
	Butadienes, Chloroplasts, Comparative Study, Computer Simulation,
	Computer-Assisted, Diagnosis, Disinfectants, Dose-Response Relationship,
	Drug, Drug Toxicity, Electrodes, Electroencephalography, Ethylamines,
	Expert Systems, Feedback, Fungicides, Gene Expression Profiling,
	Genes, Genetic Markers, Humans, Implanted, Industrial, Information
	Storage and Retrieval, Kidney, Kidney Tubules, MEDLINE, Male, Mercuric
	Chloride, Microarray Analysis, Molecular Biology, Motor Cortex, Movement,
	Natural Language Processing, Neural Networks (Computer), Non-P.H.S.,
	Non-U.S. Gov't, Plant Proteins, Predictive Value of Tests, Proteins,
	Proteome, Proximal, Puromycin Aminonucleoside, Rats, Reproducibility
	of Results, Research Support, Sprague-Dawley, Subcellular Fractions,
	Terminology, Therapy, Time Factors, Toxicogenetics, U.S. Gov't, User-Computer
	Interface},
  pii = {X3U2206L2747H31G},
  pmid = {15805072},
  url = {http://dx.doi.org/10.1080/01926230590927230}
}
@article{Tobita2005discriminant,
  author = {Tobita, M. and Nishikawa, T. and Nagashima, R.},
  title = {A discriminant model constructed by the support vector machine method
	for {HERG} potassium channel inhibitors.},
  journal = {Bioorg. {M}ed. {C}hem. {L}ett.},
  year = {2005},
  volume = {15},
  pages = {2886--2890},
  number = {11},
  month = jun,
  abstract = {{HERG} attracts attention as a risk factor for arrhythmia, which might
	trigger torsade de pointes. {A} highly accurate classifier of chemical
	compounds for inhibition of the {HERG} potassium channel is constructed
	using support vector machine. {F}or two test sets, our discriminant
	models achieved 90\% and 95\% accuracy, respectively. {T}he classifier
	is even applied for the prediction of cardio vascular adverse effects
	to achieve about 70\% accuracy. {W}hile modest inhibitors are partly
	characterized by properties linked to global structure of a molecule
	including hydrophobicity and diameter, strong inhibitors are exclusively
	characterized by properties linked to substructures of a molecule.},
  doi = {10.1016/j.bmcl.2005.03.080},
  pdf = {../local/Tobita2005discriminant.pdf},
  file = {Tobita2005discriminant.pdf:local/Tobita2005discriminant.pdf:PDF},
  keywords = {biosvm chemoinformatics herg},
  pii = {S0960-894X(05)00403-8},
  url = {http://dx.doi.org/10.1016/j.bmcl.2005.03.080}
}
@article{Tomizaki2010Protein,
  author = {{Kin-ya} Tomizaki and Kenji Usui and Hisakazu Mihara},
  title = {Protein-protein interactions and selection: array-based techniques
	for screening disease-associated biomarkers in predictive/early diagnosis.},
  journal = {FEBS J},
  year = {2010},
  volume = {277},
  pages = {1996--2005},
  number = {9},
  month = may,
  abstract = {There has been considerable interest in recent years in the development
	of miniaturized and parallelized array technology for protein-protein
	interaction analysis and protein profiling, namely 'protein-detecting
	microarrays'. Protein-detecting microarrays utilize a wide variety
	of capture agents (antibodies, fusion proteins, DNA/RNA aptamers,
	synthetic peptides, carbohydrates, and small molecules) immobilized
	at high spatial density on a solid surface. Each capture agent binds
	selectively to its target protein in a complex mixture, such as serum
	or cell lysate samples. Captured proteins are subsequently detected
	and quantified in a high-throughput fashion, with minimal sample
	consumption. Protein-detecting microarrays were first described by
	MacBeath and Schreiber in 2000, and the number of publications involving
	this technology is rapidly increasing. Furthermore, the first multiplex
	immunoassay systems have been cleared by the US Food and Drug Administration,
	signaling recognition of the usefulness of miniaturized and parallelized
	array technology for protein detection in predictive/early diagnosis.
	Although genetic tests still predominate, with further development
	protein-based diagnosis will become common in clinical use within
	a few years.},
  doi = {10.1111/j.1742-4658.2010.07626.x},
  institution = {Innovative Materials and Processing Research Center and Department
	of Materials Chemistry, Ryukoku University, Otsu, Japan.},
  keywords = {Animals; Biological Markers, analysis/metabolism; Early Diagnosis;
	Humans; Mass Screening, methods; Protein Array Analysis, methods;
	Proteins, analysis/metabolism; Risk Factors},
  language = {eng},
  medline-pst = {ppublish},
  owner = {philippe},
  pii = {EJB7626},
  pmid = {20412053},
  timestamp = {2010.07.28},
  url = {http://dx.doi.org/10.1111/j.1742-4658.2010.07626.x}
}
@article{Tothill2005expression-based,
  author = {Richard W Tothill and Adam Kowalczyk and Danny Rischin and Alex Bousioutas
	and Izhak Haviv and Ryan K van Laar and Paul M Waring and John Zalcberg
	and Robyn Ward and Andrew V Biankin and Robert L Sutherland and Susan
	M Henshall and Kwun Fong and Jonathan R Pollack and David D L Bowtell
	and Andrew J Holloway},
  title = {An expression-based site of origin diagnostic method designed for
	clinical application to cancer of unknown origin.},
  journal = {Cancer {R}es.},
  year = {2005},
  volume = {65},
  pages = {4031--4040},
  number = {10},
  month = may,
  abstract = {Gene expression profiling offers a promising new technique for the
	diagnosis and prognosis of cancer. {W}e have applied this technology
	to build a clinically robust site of origin classifier with the ultimate
	aim of applying it to determine the origin of cancer of unknown primary
	({CUP}). {A} single c{DNA} microarray platform was used to profile
	229 primary and metastatic tumors representing 14 tumor types and
	multiple histologic subtypes. {T}his data set was subsequently used
	for training and validation of a support vector machine ({SVM}) classifier,
	demonstrating 89\% accuracy using a 13-class model. {F}urther, we
	show the translation of a five-class classifier to a quantitative
	{PCR}-based platform. {S}electing 79 optimal gene markers, we generated
	a quantitative-{PCR} low-density array, allowing the assay of both
	fresh-frozen and formalin-fixed paraffin-embedded ({FFPE}) tissue.
	{D}ata generated using both quantitative {PCR} and microarray were
	subsequently used to train and validate a cross-platform {SVM} model
	with high prediction accuracy. {F}inally, we applied our {SVM} classifiers
	to 13 cases of {CUP}. {W}e show that the microarray {SVM} classifier
	was capable of making high confidence predictions in 11 of 13 cases.
	{T}hese predictions were supported by comprehensive review of the
	patients' clinical histories.},
  doi = {10.1158/0008-5472.CAN-04-3617},
  pdf = {../local/Tothill2005expression-based.pdf},
  file = {Tothill2005expression-based.pdf:local/Tothill2005expression-based.pdf:PDF},
  keywords = {biosvm microarray},
  pii = {65/10/4031},
  url = {http://dx.doi.org/10.1158/0008-5472.CAN-04-3617}
}
@article{Tsai2004Gene,
  author = {Tsai, C. A. and Chen, C. H. and Lee, T. C. and Ho, I. C. and Yang, U. C.
	and Chen, J. J.},
  title = {Gene selection for sample classifications in microarray experiments.},
  journal = {D{NA} {C}ell {B}iol.},
  year = {2004},
  volume = {23},
  pages = {607--614},
  number = {10},
  abstract = {D{NA} microarray technology provides useful tools for profiling global
	gene expression patterns in different cell/tissue samples. {O}ne
	major challenge is the large number of genes relative to the number
	of samples. {T}he use of all genes can suppress or reduce the performance
	of a classification rule due to the noise of nondiscriminatory genes.
	{S}election of an optimal subset from the original gene set becomes
	an important prestep in sample classification. {I}n this study, we
	propose a family-wise error ({FWE}) rate approach to selection of
	discriminatory genes for two-sample or multiple-sample classification.
	{T}he {FWE} approach controls the probability of the number of one
	or more false positives at a prespecified level. {A} public colon
	cancer data set is used to evaluate the performance of the proposed
	approach for the two classification methods: k nearest neighbors
	(k-{NN}) and support vector machine ({SVM}). {T}he selected gene
	sets from the proposed procedure appears to perform better than or
	comparable to several results reported in the literature using the
	univariate analysis without performing multivariate search. {I}n
	addition, we apply the {FWE} approach to a toxicogenomic data set
	with nine treatments (a control and eight metals, {A}s, {C}d, {N}i,
	{C}r, {S}b, {P}b, {C}u, and {A}s{V}) for a total of 55 samples for
	a multisample classification. {T}wo gene sets are considered: the
	gene set omega{F} formed by the {ANOVA} {F}-test, and a gene set
	omega{T} formed by the union of one-versus-all t-tests. {T}he predicted
	accuracies are evaluated using the internal and external crossvalidation.
	{U}sing the {SVM} classification, the overall accuracies to predict
	55 samples into one of the nine treatments are above 80\% for internal
	crossvalidation. {O}mega{F} has slightly higher accuracy rates than
	omega{T}. {T}he overall predicted accuracies are above 70\% for the
	external crossvalidation; the two gene sets omega{T} and omega{F}
	performed equally well.},
  doi = {10.1089/1044549042476947},
  pdf = {../local/Tsai2004Gene.pdf},
  file = {Tsai2004Gene.pdf:local/Tsai2004Gene.pdf:PDF},
  keywords = {biosvm microarray},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1089/1044549042476947}
}
@article{Tsirigos2005sensitive,
  author = {Tsirigos, A. and Rigoutsos, I.},
  title = {A sensitive, support-vector-machine method for the detection of horizontal
	gene transfers in viral, archaeal and bacterial genomes.},
  journal = {Nucleic {A}cids {R}es.},
  year = {2005},
  volume = {33},
  pages = {3699--3707},
  number = {12},
  abstract = {In earlier work, we introduced and discussed a generalized computational
	framework for identifying horizontal transfers. {T}his framework
	relied on a gene's nucleotide composition, obviated the need for
	knowledge of codon boundaries and database searches, and was shown
	to perform very well across a wide range of archaeal and bacterial
	genomes when compared with previously published approaches, such
	as {C}odon {A}daptation {I}ndex and {C} + {G} content. {N}onetheless,
	two considerations remained outstanding: we wanted to further increase
	the sensitivity of detecting horizontal transfers and also to be
	able to apply the method to increasingly smaller genomes. {I}n the
	discussion that follows, we present such a method, {W}n-{SVM}, and
	show that it exhibits a very significant improvement in sensitivity
	compared with earlier approaches. {W}n-{SVM} uses a one-class support-vector
	machine and can learn using rather small training sets. {T}his property
	makes {W}n-{SVM} particularly suitable for studying small-size genomes,
	similar to those of viruses, as well as the typically larger archaeal
	and bacterial genomes. {W}e show experimentally that the new method
	results in a superior performance across a wide range of organisms
	and that it improves even upon our own earlier method by an average
	of 10\% across all examined genomes. {A}s a small-genome case study,
	we analyze the genome of the human cytomegalovirus and demonstrate
	that {W}n-{SVM} correctly identifies regions that are known to be
	conserved and prototypical of all beta-herpesvirinae, regions that
	are known to have been acquired horizontally from the human host
	and, finally, regions that had not up to now been suspected to be
	horizontally transferred. {A}typical region predictions for many
	eukaryotic viruses, including the alpha-, beta- and gamma-herpesvirinae,
	and 123 archaeal and bacterial genomes, have been made available
	online at http://cbcsrv.watson.ibm.com/{HGT}\_{SVM}/.},
  doi = {10.1093/nar/gki660},
  pdf = {../local/Tsirigos2005sensitive.pdf},
  file = {Tsirigos2005sensitive.pdf:local/Tsirigos2005sensitive.pdf:PDF},
  keywords = {biosvm},
  pii = {33/12/3699},
  url = {http://dx.doi.org/10.1093/nar/gki660}
}
@article{Tsuda2003em,
  author = {Tsuda, K. and Akaho, S. and Asai, K.},
  title = {The em {A}lgorithm for {K}ernel {M}atrix {C}ompletion with {A}uxiliary
	{D}ata},
  journal = {J. {M}ach. {L}earn. {R}es.},
  year = {2003},
  volume = {4},
  pages = {67--81},
  abstract = {In biological data, it is often the case that observed data are available
	only for a subset of samples. {W}hen a kernel matrix is derived from
	such data, we have to leave the entries for unavailable samples as
	missing. {I}n this paper, the missing entries are completed by exploiting
	an auxiliary kernel matrix derived from another information source.
	{T}he parametric model of kernel matrices is created as a set of
	spectral variants of the auxiliary kernel matrix, and the missing
	entries are estimated by fitting this model to the existing entries.
	{F}or model fitting, we adopt the em algorithm (distinguished from
	the {EM} algorithm of {D}empster et al., 1977) based on the information
	geometry of positive definite matrices. {W}e will report promising
	results on bacteria clustering experiments using two marker sequences:
	16{S} and gyr{B}.},
  pdf = {../local/Tsuda2003em.pdf},
  file = {Tsuda2003em.pdf:local/Tsuda2003em.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://www.jmlr.org/papers/v4/tsuda03a.html}
}
@article{Tsuda2002new,
  author   = {Tsuda, K. and Kawanabe, M. and R{\"a}tsch, G. and Sonnenburg, S.
	and M{\"u}ller, K.-R.},
  title    = {A new discriminative kernel from probabilistic models},
  journal  = {Neural {C}omputation},
  year     = {2002},
  volume   = {14},
  number   = {10},
  pages    = {2397--2414},
  doi      = {10.1162/08997660260293274},
  url      = {http://dx.doi.org/10.1162/08997660260293274},
  pdf      = {../local/Tsuda2002new.pdf},
  file     = {Tsuda2002new.pdf:local/Tsuda2002new.pdf:PDF},
  keywords = {biosvm}
}
@article{Tsuda2002Marginalized,
  author   = {Tsuda, K. and Kin, T. and Asai, K.},
  title    = {Marginalized {K}ernels for {B}iological {S}equences},
  journal  = {Bioinformatics},
  year     = {2002},
  volume   = {18},
  pages    = {S268--S275},
  abstract = {Motivation: {K}ernel methods such as support vector machines require
	a kernel function between objects to be defined a priori. {S}everal
	works have been done to derive kernels from probability distributions,
	e.g., the {F}isher kernel. {H}owever, a general methodology to design
	a kernel is not fully developed. {R}esults: {W}e propose a reasonable
	way of designing a kernel when objects are generated from latent
	variable models (e.g., {HMM}). {F}irst of all, a joint kernel is
	designed for complete data which include both visible and hidden
	variables. {T}hen a marginalized kernel for visible data is obtained
	by taking the expectation with respect to hidden variables. {W}e
	will show that the {F}isher kernel is a special case of marginalized
	kernels, which gives another viewpoint to the {F}isher kernel theory.
	{A}lthough our approach can be applied to any object, we particularly
	derive several marginalized kernels useful for biological sequences
	(e.g., {DNA} and proteins). {T}he effectiveness of marginalized kernels
	is illustrated in the task of classifying bacterial gyrase subunit
	{B} (gyr{B}) amino acid sequences.},
  comment  = {Introduces the idea of marginalized kernel. Show that the Fisher kernel
	is a particular case of it, and modify it. Application to bacterial
	gyrB classification.},
  keywords = {biosvm},
  pdf      = {../local/Tsuda2002Marginalized.pdf},
  file     = {Tsuda2002Marginalized.pdf:local/Tsuda2002Marginalized.pdf:PDF}
}
@article{Tsuda2004Learning,
  author = {Tsuda, K. and Noble, W. S.},
  title = {Learning kernels from biological networks by maximizing entropy},
  journal = {Bioinformatics},
  year = {2004},
  volume = {20},
  pages = {i326--i333},
  abstract = {Motivation: {T}he diffusion kernel is a general method for computing
	pairwise distances among all nodes in a graph, based on the sum of
	weighted paths between each pair of nodes. {T}his technique has been
	used successfully, in conjunction with kernel-based learning methods,
	to draw inferences from several types of biological networks. {R}esults:
	{W}e show that computing the diffusion kernel is equivalent to maximizing
	the von {N}eumann entropy, subject to a global constraint on the
	sum of the {E}uclidean distances between nodes. {T}his global constraint
	allows for high variance in the pairwise distances. {A}ccordingly,
	we propose an alternative, locally constrained diffusion kernel,
	and we demonstrate that the resulting kernel allows for more accurate
	support vector machine prediction of protein functional classifications
	from metabolic and protein--protein interaction networks. {A}vailability:
	{S}upplementary results and data are available at noble.gs.washington.edu/proj/maxent},
  comment = {Shows that computing the diffusion kernel is equivalent to maximizing
	the von Neumann entropy under a global constraint on the sum of Euclidean
	distances between nodes, and proposes a locally constrained diffusion
	kernel. Application to SVM prediction of protein functional classes
	from metabolic and protein--protein interaction networks.},
  doi = {10.1093/bioinformatics/bth906},
  pdf = {../local/Tsuda2004Learning.pdf},
  file = {Tsuda2004Learning.pdf:local/Tsuda2004Learning.pdf:PDF},
  keywords = {learning-kernel graph-kernel biosvm},
  owner = {vert},
  url = {http://dx.doi.org/10.1093/bioinformatics/bth906}
}
@article{Uetz2000comprehensive,
  author = {Uetz, P. and Giot, L. and Cagney, G. and Mansfield, T. A. and Judson,
	R. S. and Knight, J. R. and Lockshon, D. and Narayan, V. and Srinivasan,
	M. and Pochart, P. and Qureshi-Emili, A. and Li, Y. and Godwin, B.
	and Conover, D. and Kalbfleisch, T. and Vijayadamodar, G. and Yang,
	M. and Johnston, M. and Fields, S. and Rothberg, J. M.},
  title = {A comprehensive analysis of protein-protein interactions in {Saccharomyces}
	cerevisiae},
  journal = {Nature},
  year = {2000},
  volume = {403},
  pages = {623--627},
  number = {6770},
  doi = {10.1038/35001009},
  pdf = {../local/uetz00.pdf},
  file = {uetz00.pdf:local/uetz00.pdf:PDF},
  subject = {bionet},
  url = {http://www.nature.com/cgi-taf/DynaPage.taf?file=/nature/journal/v403/n6770/full/403623a0_fs.html&content_filetype=pdf}
}
@article{Valentini2002Gene,
  author = {Valentini, G.},
  title = {Gene expression data analysis of human lymphoma using support vector
	machines and output coding ensembles.},
  journal = {Artif. {I}ntell. {M}ed.},
  year = {2002},
  volume = {26},
  pages = {281--304},
  number = {3},
  month = nov,
  abstract = {The large amount of data generated by {DNA} microarrays was originally
	analysed using unsupervised methods, such as clustering or self-organizing
	maps. {R}ecently supervised methods such as decision trees, dot-product
	support vector machines ({SVM}) and multi-layer perceptrons ({MLP})
	have been applied in order to classify normal and tumoural tissues.
	{W}e propose methods based on non-linear {SVM} with polynomial and
	{G}aussian kernels, and output coding ({OC}) ensembles of learning
	machines to separate normal from malignant tissues, to classify different
	types of lymphoma and to analyse the role of sets of coordinately
	expressed genes in carcinogenic processes of lymphoid tissues. {U}sing
	gene expression data from "{L}ymphochip", a specialised {DNA} microarray
	developed at {S}tanford {U}niversity {S}chool of {M}edicine, we show
	that {SVM} can correctly separate normal from tumoural tissues, and
	{OC} ensembles can be successfully used to classify different types
	of lymphoma. {M}oreover, we identify a group of coordinately expressed
	genes related to the separation of two distinct subgroups inside
	diffuse large {B}-cell lymphoma ({DLBCL}), validating a previous
	{A}lizadeh's hypothesis about the existence of two distinct diseases
	inside {DLBCL}.},
  doi = {10.1016/S0933-3657(02)00077-5},
  pdf = {../local/Valentini2002Gene.pdf},
  file = {Valentini2002Gene.pdf:local/Valentini2002Gene.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert}
}
@article{Vallabhaneni2004Motor,
  author = {Anirudh Vallabhaneni and Bin He},
  title = {Motor imagery task classification for brain computer interface applications
	using spatiotemporal principle component analysis.},
  journal = {Neurol {R}es},
  year = {2004},
  volume = {26},
  pages = {282--287},
  number = {3},
  month = apr,
  abstract = {Classification of single-trial imagined left- and right-hand movements
	recorded through scalp {EEG} are explored in this study. {C}lassical
	event-related desynchronization/synchronization ({ERD}/{ERS}) calculation
	approach was utilized to extract {ERD} features from the raw scalp
	{EEG} signal. {P}rinciple {C}omponent {A}nalysis ({PCA}) was used
	for feature extraction and applied on spatial, as well as temporal
	dimensions in two consecutive steps. {A} {S}upport {V}ector {M}achine
	({SVM}) classifier using a linear decision function was used to classify
	each trial as either left or right. {T}he present approach has yielded
	good classification results and promises to have potential for further
	refinement for increased accuracy as well as application in online
	brain computer interface ({BCI}).},
  doi = {10.1179/016164104225013950},
  keywords = {Amino Acids, Antibodies, Artificial Intelligence, Biological, Brain,
	Brain Mapping, Calibration, Comparative Study, Computational Biology,
	Cysteine, Cystine, Electrodes, Electroencephalography, Evoked Potentials,
	Female, Horseradish Peroxidase, Humans, Imagery (Psychotherapy),
	Imagination, Laterality, Male, Monoclonal, Movement, Neoplasms, Non-P.H.S.,
	Non-U.S. Gov't, P.H.S., Perception, Principal Component Analysis,
	Protein, Protein Array Analysis, Proteins, Research Support, Sensitivity
	and Specificity, Sequence Analysis, Tumor Markers, U.S. Gov't, User-Computer
	Interface},
  pmid = {15142321},
  url = {http://dx.doi.org/10.1179/016164104225013950}
}
@article{Vassetzky2009Chromosome,
  author = {Yegor Vassetzky and Alexey Gavrilov and Elvira Eivazova and Iryna
	Pirozhkova and Marc Lipinski and Sergey Razin},
  title = {Chromosome conformation capture (from {3C} to {5C}) and its {ChIP}-based
	modification.},
  journal = {Methods Mol Biol},
  year = {2009},
  volume = {567},
  pages = {171--188},
  abstract = {Chromosome conformation capture (3C) methodology was developed to
	study spatial organization of long genomic regions in living cells.
	Briefly, chromatin is fixed with formaldehyde in vivo to cross-link
	interacting sites, digested with a restriction enzyme and ligated
	at a low DNA concentration so that ligation between cross-linked
	fragments is favored over ligation between random fragments. Ligation
	products are then analyzed and quantified by PCR. So far, semi-quantitative
	PCR methods were widely used to estimate the ligation frequencies.
	However, it is often important to estimate the ligation frequencies
	more precisely which is only possible by using the real-time PCR.
	At the same time, it is equally necessary to monitor the specificity
	of PCR amplification. That is why the real-time PCR with TaqMan probes
	is becoming more and more popular in 3C studies. In this chapter,
	we describe the general protocol for 3C analysis with the subsequent
	estimation of ligation frequencies by using the real-time PCR technology
	with TaqMan probes. We discuss in details all steps of the experimental
	procedure paying special attention to weak points and possible ways
	to solve the problems. A special attention is also paid to the problems
	in interpretation of the results and necessary control experiments.
	Besides, in theory, we consider other approaches to analysis of the
	ligation products used in frames of the so-called 4C and 5C methods.
	The recently developed chromatin immunoprecipitation (ChIP)-loop
	assay representing a combination of 3C and ChIP is also discussed.},
  doi = {10.1007/978-1-60327-414-2_12},
  institution = {CNRS UMR-8126, Université Paris-Sud 11, Institut de Cancérologie
	Gustave Roussy, 39, rue Camille-Desmoulins, 94805, Villejuif.},
  keywords = {Chromatin Immunoprecipitation; Chromosome Mapping; Chromosomes; Cross-Linking
	Reagents; Humans; Models, Biological; Nucleic Acid Conformation;
	Polymerase Chain Reaction; Quality Control},
  owner = {phupe},
  pmid = {19588093},
  timestamp = {2010.08.11},
  url = {http://dx.doi.org/10.1007/978-1-60327-414-2_12}
}
@unpublished{Vazquez2001Modeling,
  author = {Vazquez, A. and Flammini, A. and Maritan, A. and Vespignani, A.},
  title = {Modeling of protein interaction networks},
  note = {E-print cond-mat/0108043},
  month = aug,
  year = {2001},
  eprint = {cond-mat/0108043},
  archiveprefix = {arXiv},
  pdf = {../local/vazq01.pdf},
  file = {vazq01.pdf:local/vazq01.pdf:PDF},
  subject = {bionetprot},
  url = {http://xxx.lanl.gov/abs/cond-mat/0108043}
}
@article{Venter2001Sequence,
  author = {Venter, J. C. and others},
  title = {The {S}equence of the {H}uman {G}enome},
  journal = {Science},
  year = {2001},
  volume = {291},
  pages = {1304--1351},
  number = {5507},
  abstract = {A 2.91-billion base pair (bp) consensus sequence of the euchromatic
	portion of the human genome was generated by the whole-genome shotgun
	sequencing method. {T}he 14.8-billion bp {DNA} sequence was generated
	over 9 months from 27,271,853 high-quality sequence reads (5.11-fold
	coverage of the genome) from both ends of plasmid clones made from
	the {DNA} of five individuals. {T}wo assembly strategies--a whole-genome
	assembly and a regional chromosome assembly--were used, each combining
	sequence data from {C}elera and the publicly funded genome effort.
	{T}he public data were shredded into 550-bp segments to create a
	2.9-fold coverage of those genome regions that had been sequenced,
	without including biases inherent in the cloning and assembly procedure
	used by the publicly funded group. {T}his brought the effective coverage
	in the assemblies to eightfold, reducing the number and size of gaps
	in the final assembly over what would be obtained with 5.11-fold
	coverage. {T}he two assembly strategies yielded very similar results
	that largely agree with independent mapping data. {T}he assemblies
	effectively cover the euchromatic regions of the human chromosomes.
	{M}ore than 90\% of the genome is in scaffold assemblies of 100,000
	bp or more, and 25\% of the genome is in scaffolds of 10 million bp
	or larger. {A}nalysis of the genome sequence revealed 26,588 protein-encoding
	transcripts for which there was strong corroborating evidence and
	an additional $\sim$12,000 computationally derived genes with mouse matches
	or other weak supporting evidence. {A}lthough gene-dense clusters
	are obvious, almost half the genes are dispersed in low {G}+{C} sequence
	separated by large tracts of apparently noncoding sequence. {O}nly
	1.1\% of the genome is spanned by exons, whereas 24\% is in introns,
	with 75\% of the genome being intergenic {DNA}. {D}uplications of
	segmental blocks, ranging in size up to chromosomal lengths, are
	abundant throughout the genome and reveal a complex evolutionary
	history. {C}omparative genomic analysis indicates vertebrate expansions
	of genes associated with neuronal function, with tissue-specific
	developmental regulation, and with the hemostasis and immune systems.
	{DNA} sequence comparisons between the consensus sequence and publicly
	funded genome data provided locations of 2.1 million single-nucleotide
	polymorphisms ({SNP}s). {A} random pair of human haploid genomes
	differed at a rate of 1 bp per 1250 on average, but there was marked
	heterogeneity in the level of polymorphism across the genome. {L}ess
	than 1\% of all {SNP}s resulted in variation in proteins, but the
	task of determining which {SNP}s have functional consequences remains
	an open challenge.},
  doi = {10.1126/science.1058040},
  pdf = {../local/Venter2001Sequence.pdf},
  file = {Venter2001Sequence.pdf:local/Venter2001Sequence.pdf:PDF},
  keywords = {genomics bio},
  owner = {vert},
  url = {http://www.sciencemag.org/cgi/content/abstract/291/5507/1304}
}
@article{Vercoutere2001Rapid,
  author = {W. Vercoutere and S. Winters-Hilt and H. Olsen and D. Deamer and
	D. Haussler and M. Akeson},
  title = {Rapid discrimination among individual {DNA} hairpin molecules at
	single-nucleotide resolution using an ion channel.},
  journal = {Nat {B}iotechnol},
  year = {2001},
  volume = {19},
  pages = {248--252},
  number = {3},
  month = mar,
  abstract = {{RNA} and {DNA} strands produce ionic current signatures when driven
	through an alpha-hemolysin channel by an applied voltage. {H}ere
	we combine this nanopore detector with a support vector machine ({SVM})
	to analyze {DNA} hairpin molecules on the millisecond time scale.
	{M}easurable properties include duplex stem length, base pair mismatches,
	and loop length. {T}his nanopore instrument can discriminate between
	individual {DNA} hairpins that differ by one base pair or by one
	nucleotide.},
  doi = {10.1038/85696},
  pdf = {../local/Vercoutere2001Rapid.pdf},
  file = {Vercoutere2001Rapid.pdf:local/Vercoutere2001Rapid.pdf:PDF},
  keywords = {Acute, Acute Disease, Adenocarcinoma, Algorithms, Amino Acid Sequence,
	Artificial Intelligence, Automated, B-Lymphocytes, Bacterial Proteins,
	Base Pair Mismatch, Base Sequence, Bayes Theorem, Binding Sites,
	Biological, Bone Marrow Cells, Cell Compartmentation, Chemistry,
	Child, Chromosome Aberrations, Comparative Study, Computational Biology,
	Computer Simulation, Computer-Assisted, DNA, Data Interpretation,
	Databases, Decision Trees, Diagnosis, Discriminant Analysis, Electric
	Conductivity, Electrophysiology, Escherichia coli Proteins, Factual,
	Female, Fungal, Gastric Emptying, Gene Expression Profiling, Gene
	Expression Regulation, Genes, Genetic, Genetic Markers, Hemolysins,
	Humans, Ion Channels, Kinetics, Leukemia, Lipid Bilayers, Logistic
	Models, Lymphocytic, Male, Markov Chains, Melanoma, Models, Molecular,
	Myeloid, Neoplasm, Neoplastic, Neural Networks (Computer), Nevus,
	Non-P.H.S., Non-U.S. Gov't, Nucleic Acid Conformation, Organ Specificity,
	Organelles, P.H.S., Pattern Recognition, Physical, Pigmented, Predictive
	Value of Tests, Promoter Regions (Genetics), Protein Folding, Protein
	Structure, Proteins, Proteome, RNA, Reproducibility of Results, Research
	Support, Saccharomyces cerevisiae, Secondary, Sensitivity and Specificity,
	Sequence Alignment, Sex Characteristics, Skin Diseases, Skin Neoplasms,
	Skin Pigmentation, Software, Statistical, Stomach Diseases, T-Lymphocytes,
	Thermodynamics, Transcription, Transcription Factors, Tumor Markers,
	U.S. Gov't},
  pii = {85696},
  pmid = {11231558},
  url = {http://dx.doi.org/10.1038/85696}
}
@inproceedings{Vert2002Support,
  author = {Vert, J.-P.},
  title = {Support vector machine prediction of signal peptide cleavage site
	using a new class of kernels for strings},
  booktitle = {Proceedings of the {P}acific {S}ymposium on {B}iocomputing 2002},
  year = {2002},
  editor = {R. B. Altman and A. K. Dunker and L. Hunter and K. Lauderdale and
	T. E. Klein},
  pages = {649--660},
  publisher = {World Scientific},
  pdf = {../local/vert02.pdf},
  file = {vert02.pdf:local/vert02.pdf:PDF},
  keywords = {biosvm},
  subject = {biokernel},
  url = {http://www.smi.stanford.edu/projects/helix/psb02/vert.pdf}
}
@techreport{Vert2005Kernel,
  author = {Vert, J.-P.},
  title = {Kernel methods in computational biology},
  institution = {CNRS-HAL},
  year = {2005},
  number = {ccsd-00012124},
  month = oct,
  abstract = {Support vector machines and kernel methods are increasingly popular
	in genomics and computational biology, due to their good performance
	in real-world applications and strong modularity that makes them
	suitable to a wide range of problems, from the classification of
	tumors to the automatic annotation of proteins. {T}heir ability to
	work in high dimension, to process non-vectorial data, and the natural
	framework they provide to integrate heterogeneous data are particularly
	relevant to various problems arising in computational biology. {I}n
	this chapter we survey some of the most prominent applications published
	so far, highlighting the particular developments in kernel methods
	triggered by problems in biology, and mention a few promising research
	directions likely to expand in the future.},
  pdf = {../local/Vert2005Kernel.pdf},
  file = {Vert2005Kernel.pdf:local/Vert2005Kernel.pdf:PDF},
  keywords = {biosvm},
  url = {http://hal.ccsd.cnrs.fr/ccsd-00012124}
}
@article{Vert2002tree,
  author   = {Vert, J.-P.},
  title    = {A tree kernel to analyze phylogenetic profiles},
  journal  = {Bioinformatics},
  volume   = {18},
  pages    = {S276--S284},
  year     = {2002},
  keywords = {biosvm},
  subject  = {biokernel},
  url      = {http://cbio.ensmp.fr/~jvert/publi/ismb02/index.html},
  pdf      = {../local/vert02b.pdf},
  file     = {vert02b.pdf:local/vert02b.pdf:PDF}
}
@inproceedings{Vert2003Graph-driven,
  author = {Vert, J.-P. and Kanehisa, M.},
  title = {Graph-driven features extraction from microarray data using diffusion
	kernels and kernel {CCA}},
  booktitle = {Adv. {N}eural {I}nform. {P}rocess. {S}yst.},
  year = {2003},
  editor = {S. Becker and S. Thrun and K. Obermayer},
  volume = {15},
  pages = {1449--1456},
  publisher = {MIT Press},
  pdf = {../local/Vert2003Graph-driven.pdf},
  file = {Vert2003Graph-driven.pdf:local/Vert2003Graph-driven.pdf:PDF},
  keywords = {biosvm}
}
@article{Vert2003Extracting,
  author = {Vert, J.-P. and Kanehisa, M.},
  title = {Extracting active pathways from gene expression data},
  journal = {Bioinformatics},
  year = {2003},
  volume = {19},
  pages = {ii238--ii244},
  number = {Suppl. 2},
  abstract = {Motivation: {A} promising way to make sense out of gene expression
	profiles is to relate them to the activity of metabolic and signalling
	pathways. {E}ach pathway usually involves many genes, such as enzymes,
	which can themselves participate in many pathways. {T}he set of all
	known pathways can therefore be represented by a complex network
	of genes. {S}earching for regularities in the set of gene expression
	profiles with respect to the topology of this gene network is a way
	to automatically extract active pathways and their associated patterns
	of activity. {M}ethod: {W}e present a method to perform this task,
	which consists in encoding both the gene network and the set of profiles
	into two kernel functions, and performing a regularized form of canonical
	correlation analysis between the two kernels. {R}esults: {W}hen applied
	to publicly available expression data the method is able to extract
	biologically relevant expression patterns, as well as pathways with
	related activity.},
  pdf = {../local/Vert2003Extracting.pdf},
  file = {Vert2003Extracting.pdf:local/Vert2003Extracting.pdf:PDF},
  keywords = {biosvm},
  owner = {vert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/suppl_2/ii238}
}
@techreport{Vert2002Graph-driven,
  author = {Vert, J.-P. and Kanehisa, M.},
  title = {Graph-driven features extraction from microarray data},
  institution = {Arxiv physics},
  year = {2002},
  number = {0206055},
  keywords = {biosvm},
  url = {http://arxiv.org/abs/physics/0206055}
}
@incollection{Vert2004Local,
  author = {Vert, J.-P. and Saigo, H. and Akutsu, T.},
  title = {Local alignment kernels for biological sequences},
  booktitle = {Kernel {M}ethods in {C}omputational {B}iology},
  publisher = {MIT Press},
  year = {2004},
  editor = {Sch{\"o}lkopf, B. and Tsuda, K. and Vert, J.-P.},
  pages = {131--154},
  address = {Cambridge, MA},
  pdf = {http://cg.ensmp.fr/~vert/publi/04kmcbbook/saigo.pdf},
  file = {saigo.pdf:http\://cg.ensmp.fr/~vert/publi/04kmcbbook/saigo.pdf:PDF},
  keywords = {biosvm},
  owner = {vert}
}
@inproceedings{Vert2006Kernels,
  author = {Vert, J.-P. and Thurman, R. and Noble, W. S.},
  title = {Kernels for gene regulatory regions},
  booktitle = {Adv. {N}eural {I}nform. {P}rocess. {S}yst.},
  year = {2006},
  editor = {Y. Weiss and B. Sch{\"o}lkopf and J. Platt},
  volume = {18},
  pages = {1401--1408},
  address = {Cambridge, MA},
  publisher = {MIT Press},
  keywords = {biosvm}
}
@incollection{Vert2004primer,
  author = {Vert, J.-P. and Tsuda, K. and Sch{\"o}lkopf, B.},
  title = {A primer on kernel methods},
  booktitle = {Kernel {M}ethods in {C}omputational {B}iology},
  publisher = {MIT Press},
  year = {2004},
  editor = {Sch{\"o}lkopf, B. and Tsuda, K. and Vert, J.-P.},
  pages = {35--70},
  keywords = {biosvm},
  owner = {vert}
}
@inproceedings{Vert2005Supervised,
  author = {Vert, J.-P. and Yamanishi, Y.},
  title = {Supervised graph inference},
  booktitle = {Adv. {N}eural {I}nform. {P}rocess. {S}yst.},
  year = {2005},
  editor = {Saul, L. K. and Weiss, Y. and Bottou, L.},
  volume = {17},
  pages = {1433--1440},
  address = {Cambridge, MA},
  publisher = {MIT Press},
  pdf = {http://cg.ensmp.fr/~vert/publi/04nips_yamanishi/nips2004.pdf},
  file = {nips2004.pdf:http\://cg.ensmp.fr/~vert/publi/04nips_yamanishi/nips2004.pdf:PDF},
  keywords = {biosvm},
  owner = {vert}
}
@article{Vinayagam2004Applying,
  author = {Vinayagam, A. and K{\"o}nig, R. and Moormann, J. and Schubert, F. and
	Eils, R. and Glatting, K.-H. and Suhai, S.},
  title = {Applying {S}upport {V}ector {M}achines for {G}ene {O}ntology based
	gene function prediction.},
  journal = {B{MC} {B}ioinformatics},
  year = {2004},
  volume = {5},
  pages = {116},
  number = {1},
  month = aug,
  abstract = {{BACKGROUND}: {T}he current progress in sequencing projects calls
	for rapid, reliable and accurate function assignments of gene products.
	{A} variety of methods has been designed to annotate sequences on
	a large scale. {H}owever, these methods can either only be applied
	for specific subsets, or their results are not formalised, or they
	do not provide precise confidence estimates for their predictions.
	{RESULTS}: {W}e have developed a large-scale annotation system that
	tackles all of these shortcomings. {I}n our approach, annotation
	was provided through {G}ene {O}ntology terms by applying multiple
	{S}upport {V}ector {M}achines ({SVM}) for the classification of correct
	and false predictions. {T}he general performance of the system was
	benchmarked with a large dataset. {A}n organism-wise cross-validation
	was performed to define confidence estimates, resulting in an average
	precision of 80\% for 74\% of all test sequences. {T}he validation
	results show that the prediction performance was organism-independent
	and could reproduce the annotation of other automated systems as
	well as high-quality manual annotations. {W}e applied our trained
	classification system to {X}enopus laevis sequences, yielding functional
	annotation for more than half of the known expressed genome. {C}ompared
	to the currently available annotation, we provided more than twice
	the number of contigs with good quality annotation, and additionally
	we assigned a confidence value to each predicted {GO} term. {CONCLUSIONS}:
	{W}e present a complete automated annotation system that overcomes
	many of the usual problems by applying a controlled vocabulary of
	{G}ene {O}ntology and an established classification method on large
	and well-described sequence data sets. {I}n a case study, the function
	for {X}enopus laevis contig sequences was predicted and the results
	are publicly available at ftp://genome.dkfz-heidelberg.de/pub/agd/gene\_association.agd\_{X}enopus.},
  doi = {10.1186/1471-2105-5-116},
  pdf = {../local/Vinayagam2004Applying.pdf},
  file = {Vinayagam2004Applying.pdf:local/Vinayagam2004Applying.pdf:PDF},
  keywords = {biosvm},
  pii = {1471-2105-5-116},
  url = {http://dx.doi.org/10.1186/1471-2105-5-116}
}
@article{Vlahovicek2005SBASE,
  author = {Vlahovicek, Kristian and Kaj{\'a}n, L{\'a}szl{\'o} and Agoston, Vilmos
	and Pongor, S{\'a}ndor},
  title = {The {SBASE} domain sequence resource, release 12: prediction of protein
	domain-architecture using support vector machines.},
  journal = {Nucleic {A}cids {R}es},
  year = {2005},
  volume = {33},
  pages = {D223--D225},
  number = {Database issue},
  month = jan,
  abstract = {{SBASE} (http://www.icgeb.trieste.it/sbase) is an online resource
	designed to facilitate the detection of domain homologies based on
	sequence database search. {T}he present release of the {SBASE} {A}
	library of protein domain sequences contains 972,397 protein sequence
	segments annotated by structure, function, ligand-binding or cellular
	topology, clustered into 8547 domain groups. {SBASE} {B} contains
	169,916 domain sequences clustered into 2526 less well-characterized
	groups. {D}omain prediction is based on an evaluation of database
	search results in comparison with a 'similarity network' of inter-sequence
	similarity scores, using support vector machines trained on similarity
	search results of known domains.},
  doi = {10.1093/nar/gki112},
  pdf = {../local/Vlahovicek2005SBASE.pdf},
  file = {Vlahovicek2005SBASE.pdf:local/Vlahovicek2005SBASE.pdf:PDF},
  keywords = {biosvm},
  pii = {33/suppl_1/D223},
  url = {http://dx.doi.org/10.1093/nar/gki112}
}
@article{Voduc2010Breast,
  author = {K. David Voduc and Maggie C U Cheang and Scott Tyldesley and Karen
	Gelmon and Torsten O Nielsen and Hagen Kennecke},
  title = {Breast cancer subtypes and the risk of local and regional relapse.},
  journal = {J Clin Oncol},
  year = {2010},
  volume = {28},
  pages = {1684--1691},
  number = {10},
  month = apr,
  abstract = {The risk of local and regional relapse associated with each breast
	cancer molecular subtype was determined in a large cohort of patients
	with breast cancer. Subtype assignment was accomplished using a validated
	six-marker immunohistochemical panel applied to tissue microarrays. Semiquantitative
	analysis of estrogen receptor (ER), progesterone receptor (PR), Ki-67,
	human epidermal growth factor receptor 2 (HER2), epidermal growth
	factor receptor (EGFR), and cytokeratin (CK) 5/6 was performed on
	tissue microarrays constructed from 2,985 patients with early invasive
	breast cancer. Patients were classified into the following categories:
	luminal A, luminal B, luminal-HER2, HER2 enriched, basal-like, or
	triple-negative phenotype-nonbasal. Multivariable Cox analysis was
	used to determine the risk of local or regional relapse associated
	the intrinsic subtypes, adjusting for standard clinicopathologic
	factors. The intrinsic molecular subtype was successfully determined
	in 2,985 tumors. The median follow-up time was 12 years, and there
	have been a total of 325 local recurrences and 227 regional lymph
	node recurrences. Luminal A tumors (ER or PR positive, HER2 negative,
	Ki-67 < 1\%) had the best prognosis and the lowest rate of local
	or regional relapse. For patients undergoing breast conservation,
	HER2-enriched and basal subtypes demonstrated an increased risk of
	regional recurrence, and this was statistically significant on multivariable
	analysis. After mastectomy, luminal B, luminal-HER2, HER2-enriched,
	and basal subtypes were all associated with an increased risk of
	local and regional relapse on multivariable analysis. Luminal A tumors
	are associated with a low risk of local or regional recurrence. Molecular
	subtyping of breast tumors using a six-marker immunohistochemical
	panel can identify patients at increased risk of local and regional
	recurrence.},
  doi = {10.1200/JCO.2009.24.9284},
  pdf = {../local/Voduc2010Breast.pdf},
  file = {Voduc2010Breast.pdf:Voduc2010Breast.pdf:PDF},
  institution = {Department of Radiation Oncology, British Columbia Cancer Agency,
	Vancouver, British Columbia, Canada V5Z 4E6. dvoduc@bccancer.bc.ca},
  keywords = {Adult; Breast Neoplasms, mortality/pathology; Female; Humans; Ki-67
	Antigen, metabolism; Lymphatic Metastasis; Middle Aged; Neoplasm
	Metastasis; Neoplasm Recurrence, Local; Neoplasms, Hormone-Dependent;
	Receptor, Epidermal Growth Factor, metabolism; Receptors, Estrogen,
	analysis; Receptors, Progesterone, analysis; Tissue Array Analysis;
	Tumor Markers, Biological, analysis},
  language = {eng},
  medline-pst = {ppublish},
  owner = {phupe},
  pii = {JCO.2009.24.9284},
  pmid = {20194857},
  timestamp = {2011.06.01},
  url = {http://dx.doi.org/10.1200/JCO.2009.24.9284}
}
@article{Wagner2001Yeast,
  author  = {Wagner, A.},
  title   = {The {Y}east {P}rotein {I}nteraction {N}etwork {E}volves {R}apidly
	and {C}ontains {F}ew {R}edundant {D}uplicate {G}enes},
  journal = {Mol. {B}iol. {E}vol.},
  volume  = {18},
  pages   = {1283--1292},
  year    = {2001},
  subject = {bionet},
  url     = {http://www.santafe.edu/sfi/publications/Abstracts/01-04-022abs.html},
  pdf     = {../local/wagn01.pdf},
  file    = {wagn01.pdf:local/wagn01.pdf:PDF}
}
@article{Wagner2003Protocols,
  author = {Wagner, M. and Naik, D. and Pothen, A.},
  title = {Protocols for disease classification from mass spectrometry data.},
  journal = {Proteomics},
  year = {2003},
  volume = {3},
  pages = {1692--1698},
  number = {9},
  abstract = {We report our results in classifying protein matrix-assisted laser
	desorption/ionization-time of flight mass spectra obtained from serum
	samples into diseased and healthy groups. {W}e discuss in detail
	five of the steps in preprocessing the mass spectral data for biomarker
	discovery, as well as our criterion for choosing a small set of peaks
	for classifying the samples. {C}ross-validation studies with four
	selected proteins yielded misclassification rates in the 10-15\% range
	for all the classification methods. {T}hree of these proteins or
	protein fragments are down-regulated and one up-regulated in lung
	cancer, the disease under consideration in this data set. {W}hen
	cross-validation studies are performed, care must be taken to ensure
	that the test set does not influence the choice of the peaks used
	in the classification. {M}isclassification rates are lower when both
	the training and test sets are used to select the peaks used in classification
	versus when only the training set is used. {T}his expectation was
	validated for various statistical discrimination methods when thirteen
	peaks were used in cross-validation studies. {O}ne particular classification
	method, a linear support vector machine, exhibited especially robust
	performance when the number of peaks was varied from four to thirteen,
	and when the peaks were selected from the training set alone. {E}xperiments
	with the samples randomly assigned to the two classes confirmed that
	misclassification rates were significantly higher in such cases than
	those observed with the true data. {T}his indicates that our findings
	are indeed significant. {W}e found closely matching masses in a database
	for protein expression in lung cancer for three of the four proteins
	we used to classify lung cancer. {D}ata from additional samples,
	increased experience with the performance of various preprocessing
	techniques, and affirmation of the biological roles of the proteins
	that help in classification, will strengthen our conclusions in the
	future.},
  doi = {10.1002/pmic.200300519},
  pdf = {../local/Wagner2003Protocols.pdf},
  file = {Wagner2003Protocols.pdf:local/Wagner2003Protocols.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1002/pmic.200300519}
}
@article{Wagner2004Computational,
  author = {Wagner, M. and Naik, D. N. and Pothen, A. and Kasukurti, S. and Devineni,
	R. R. and Adam, B. L. and Semmes, O. J. and Wright, Jr., G. L.},
  title = {Computational protein biomarker prediction: a case study for prostate
	cancer},
  journal = {B{MC} {B}ioinformatics},
  year = {2004},
  volume = {5},
  pages = {26},
  abstract = {Background {R}ecent technological advances in mass spectrometry pose
	challenges in computational mathematics and statistics to process
	the mass spectral data into predictive models with clinical and biological
	significance. {W}e discuss several classification-based approaches
	to finding protein biomarker candidates using protein profiles obtained
	via mass spectrometry, and we assess their statistical significance.
	{O}ur overall goal is to implicate peaks that have a high likelihood
	of being biologically linked to a given disease state, and thus to
	narrow the search for biomarker candidates. {R}esults {T}horough
	cross-validation studies and randomization tests are performed on
	a prostate cancer dataset with over 300 patients, obtained at the
	{E}astern {V}irginia {M}edical {S}chool using {SELDI}-{TOF} mass
	spectrometry. {W}e obtain average classification accuracies of 87\%
	on a four-group classification problem using a two-stage linear {SVM}-based
	procedure and just 13 peaks, with other methods performing comparably.
	{C}onclusions {M}odern feature selection and classification methods
	are powerful techniques for both the identification of biomarker
	candidates and the related problem of building predictive models
	from protein mass spectrometric profiles. {C}ross-validation and
	randomization are essential tools that must be performed carefully
	in order not to bias the results unfairly. {H}owever, only a biological
	validation and identification of the underlying proteins will ultimately
	confirm the actual value and power of any computational predictions.},
  doi = {10.1186/1471-2105-5-26},
  pdf = {../local/Wagner2004Computational.pdf},
  file = {Wagner2004Computational.pdf:local/Wagner2004Computational.pdf:PDF},
  keywords = {biosvm},
  owner = {vert},
  url = {http://www.biomedcentral.com/1471-2105/5/26}
}
@article{Wahba2002Soft,
  author = {Grace Wahba},
  title = {Soft and hard classification by reproducing kernel {H}ilbert space
	methods.},
  journal = {Proc {N}atl {A}cad {S}ci {U} {S} {A}},
  year = {2002},
  volume = {99},
  pages = {16524--16530},
  number = {26},
  month = dec,
  abstract = {Reproducing kernel {H}ilbert space ({RKHS}) methods provide a unified
	context for solving a wide variety of statistical modelling and function
	estimation problems. {W}e consider two such problems: {W}e are given
	a training set [yi, ti, i = 1, \ldots, n], where yi is the response
	for the ith subject, and ti is a vector of attributes for this subject.
	{T}he value of y(i) is a label that indicates which category it came
	from. {F}or the first problem, we wish to build a model from the
	training set that assigns to each t in an attribute domain of interest
	an estimate of the probability pj(t) that a (future) subject with
	attribute vector t is in category j. {T}he second problem is in some
	sense less ambitious; it is to build a model that assigns to each
	t a label, which classifies a future subject with that t into one
	of the categories or possibly "none of the above." {T}he approach
	to the first of these two problems discussed here is a special case
	of what is known as penalized likelihood estimation. {T}he approach
	to the second problem is known as the support vector machine. {W}e
	also note some alternate but closely related approaches to the second
	problem. {T}hese approaches are all obtained as solutions to optimization
	problems in {RKHS}. {M}any other problems, in particular the solution
	of ill-posed inverse problems, can be obtained as solutions to optimization
	problems in {RKHS} and are mentioned in passing. {W}e caution the
	reader that although a large literature exists in all of these topics,
	in this inaugural article we are selectively highlighting work of
	the author, former students, and other collaborators.},
  doi = {10.1073/pnas.242574899},
  pdf = {../local/Wahba2002Soft.pdf},
  file = {Wahba2002Soft.pdf:local/Wahba2002Soft.pdf:PDF},
  keywords = {Acute, Algorithms, Animals, Automated, Base Pair Mismatch, Base Pairing,
	Base Sequence, Biological, Biosensing Techniques, Classification,
	Cluster Analysis, Comparative Study, Computational Biology, Computer-Assisted,
	Cystadenoma, DNA, Drug, Drug Design, Eukaryotic Cells, Female, Gene
	Expression, Gene Expression Profiling, Gene Expression Regulation,
	Genes, Genetic, Genetic Markers, Hemolysins, Humans, Leukemia, Ligands,
	Likelihood Functions, Lymphocytic, Markov Chains, Mathematics, Messenger,
	Models, Molecular, Molecular Probe Techniques, Molecular Sequence
	Data, Nanotechnology, Neoplasm, Neoplastic, Neural Networks (Computer),
	Non-P.H.S., Non-U.S. Gov't, Nucleic Acid Conformation, Observer Variation,
	Oligonucleotide Array Sequence Analysis, Ovarian Neoplasms, P.H.S.,
	Pattern Recognition, Probability, Protein Binding, Proteins, Quality
	Control, RNA, RNA Splicing, Receptors, Reference Values, Reproducibility
	of Results, Research Support, Sensitivity and Specificity, Sequence
	Analysis, Signal Processing, Statistical, Stomach Neoplasms, Thermodynamics,
	Transcription, Tumor Markers, U.S. Gov't},
  pii = {242574899},
  pmid = {12477931},
  url = {http://dx.doi.org/10.1073/pnas.242574899}
}
@article{Wang2005Protein,
  author = {Wang, J. and Sung, W.-K. and Krishnan, A. and Li, K.-B.},
  title = {Protein subcellular localization prediction for {G}ram-negative bacteria
	using amino acid subalphabets and a combination of multiple support
	vector machines.},
  journal = {B{MC} {B}ioinformatics},
  year = {2005},
  volume = {6},
  pages = {174},
  number = {1},
  month = jul,
  abstract = {{BACKGROUND}: {P}redicting the subcellular localization of proteins
	is important for determining the function of proteins. {P}revious
	works focused on predicting protein localization in {G}ram-negative
	bacteria obtained good results. {H}owever, these methods had relatively
	low accuracies for the localization of extracellular proteins. {T}his
	paper studies ways to improve the accuracy for predicting extracellular
	localization in {G}ram-negative bacteria. {RESULTS}: {W}e have developed
	a system for predicting the subcellular localization of proteins
	for {G}ram-negative bacteria based on amino acid subalphabets and
	a combination of multiple support vector machines. {T}he recall of
	the extracellular site and overall recall of our predictor reach
	86.0\% and 89.8\%, respectively, in 5-fold cross-validation. {T}o
	the best of our knowledge, these are the most accurate results for
	predicting subcellular localization in {G}ram-negative bacteria.
	{CONCLUSIONS}: {C}lustering 20 amino acids into a few groups by the
	proposed greedy algorithm provides a new way to extract features
	from protein sequences to cover more adjacent amino acids and hence
	reduce the dimensionality of the input vector of protein features.
	{I}t was observed that a good amino acid grouping leads to an increase
	in prediction performance. {F}urthermore, a proper choice of a subset
	of complementary support vector machines constructed by different
	features of proteins maximizes the prediction accuracy.},
  doi = {10.1186/1471-2105-6-174},
  pdf = {../local/Wang2005Protein.pdf},
  file = {Wang2005Protein.pdf:local/Wang2005Protein.pdf:PDF},
  keywords = {biosvm},
  pii = {1471-2105-6-174},
  url = {http://dx.doi.org/10.1186/1471-2105-6-174}
}
@article{Wang2004Simple,
  author = {Kai Wang and Ekachai Jenwitheesuk and Ram Samudrala and John E Mittler},
  title = {Simple linear model provides highly accurate genotypic predictions
	of {HIV}-1 drug resistance.},
  journal = {Antivir {T}her},
  year = {2004},
  volume = {9},
  pages = {343--352},
  number = {3},
  month = jun,
  abstract = {Drug resistance is a major obstacle to the successful treatment of
	{HIV}-1 infection. {G}enotypic assays are used widely to provide
	indirect evidence of drug resistance, but the performance of these
	assays has been mixed. {W}e used standard stepwise linear regression
	to construct drug resistance models for seven protease inhibitors
	and 10 reverse transcriptase inhibitors using data obtained from
	the {S}tanford {HIV} drug resistance database. {W}e evaluated these
	models by hold-one-out experiments and by tests on an independent
	dataset. {O}ur linear model outperformed other publicly available
	genotypic interpretation algorithms, including decision tree, support
	vector machine and four rules-based algorithms ({HIV}db, {VGI}, {ANRS}
	and {R}ega) under both tests. {I}nterestingly, our model did well
	despite the absence of any terms for interactions between different
	residues in protease or reverse transcriptase. {T}he resulting linear
	models are easy to understand and can potentially assist in choosing
	combination therapy regimens.},
  keywords = {Algorithms, Computational Biology, Databases, Drug Resistance, Forecasting,
	Genetic, Genotype, HIV Protease Inhibitors, HIV-1, Humans, Information
	Management, Information Storage and Retrieval, Kinetics, Linear Models,
	Microbial Sensitivity Tests, Models, Non-U.S. Gov't, P.H.S., Periodicals,
	Point Mutation, Pyrimidinones, Research Support, Reverse Transcriptase
	Inhibitors, Theoretical, U.S. Gov't, Viral},
  pmid = {15259897}
}
@article{Wang2004Predicting,
  author = {Long-Hui Wang and Juan Liu and Yan-Fu Li and Huai-Bei Zhou},
  title = {Predicting protein secondary structure by a support vector machine
	based on a new coding scheme.},
  journal = {Genome {I}nform {S}er {W}orkshop {G}enome {I}nform},
  year = {2004},
  volume = {15},
  pages = {181--190},
  number = {2},
  abstract = {Protein structure prediction is one of the most important problems
	in modern computational biology. {P}rotein secondary structure prediction
	is a key step in prediction of protein tertiary structure. {T}here
	have emerged many methods based on machine learning techniques, such
	as neural networks ({NN}) and support vector machine ({SVM}) etc.,
	to focus on the prediction of the secondary structures. {I}n this
	paper, a new method was proposed based on {SVM}. {D}ifferent from
	the existing methods, this method takes into account of the physical-chemical
	properties and structure properties of amino acids. {W}hen tested
	on the most popular dataset {CB}513, it achieved a {Q}(3) accuracy
	of 0.7844, which illustrates that it is one of the top range methods
	for protein of secondary structure prediction.},
  keywords = {biosvm},
  url = {http://www.jsbi.org/journal/GIW04/GIW04F019.html}
}
@article{Wang2005Using,
  author = {M. Wang and J. Yang and K.-C. Chou},
  title = {Using string kernel to predict signal peptide cleavage site based
	on subsite coupling model.},
  journal = {Amino {A}cids},
  year = {2005},
  volume = {28},
  pages = {395--402},
  number = {4},
  month = jun,
  abstract = {Owing to the importance of signal peptides for studying the molecular
	mechanisms of genetic diseases, reprogramming cells for gene therapy,
	and finding new drugs for healing a specific defect, it is in great
	demand to develop a fast and accurate method to identify the signal
	peptides. {I}ntroduction of the so-called \{-3,-1, +1\} coupling model
	({C}hou, {K}. {C}.: {P}rotein {E}ngineering, 2001, 14-2, 75-79) has
	made it possible to take into account the coupling effect among some
	key subsites and hence can significantly enhance the prediction quality
	of peptide cleavage site. {B}ased on the subsite coupling model,
	a kind of string kernels for protein sequence is introduced. {I}ntegrating
	the biologically relevant prior knowledge, the constructed string
	kernels can thus be used by any kernel-based method. {A} {S}upport
	vector machines ({SVM}) is thus built to predict the cleavage site
	of signal peptides from the protein sequences. {T}he current approach
	is compared with the classical weight matrix method. {A}t small false
	positive ratios, our method outperforms the classical weight matrix
	method, indicating the current approach may at least serve as a powerful
	complemental tool to other existing methods for predicting the signal
	peptide cleavage site. {T}he software that generated the results reported
	in this paper is available upon requirement, and will appear at http://www.pami.sjtu.edu.cn/wm.},
  doi = {10.1007/s00726-005-0189-6},
  pdf = {../local/Wang2005Using.pdf},
  file = {Wang2005Using.pdf:local/Wang2005Using.pdf:PDF},
  keywords = {biosvm},
  url = {http://dx.doi.org/10.1007/s00726-005-0189-6}
}
@article{Wang2004Weighted-support,
  author = {Wang, M. and Yang, J. and Liu, G.-P. and Xu, Z.-J. and Chou, K.-C.},
  title = {Weighted-support vector machines for predicting membrane protein
	types based on pseudo-amino acid composition},
  journal = {Protein {E}ng. {D}es. {S}el.},
  year = {2004},
  volume = {17},
  pages = {509--516},
  number = {6},
  abstract = {Membrane proteins are generally classified into the following five
	types: (1) type {I} membrane proteins, (2) type {II} membrane proteins,
	(3) multipass transmembrane proteins, (4) lipid chain-anchored membrane
	proteins and (5) {GPI}-anchored membrane proteins. {P}rediction of
	membrane protein types has become one of the growing hot topics in
	bioinformatics. {C}urrently, we are facing two critical challenges
	in this area: first, how to take into account the extremely complicated
	sequence-order effects, and second, how to deal with the highly uneven
	sizes of the subsets in a training dataset. {I}n this paper, stimulated
	by the concept of using the pseudo-amino acid composition to incorporate
	the sequence-order effects, the spectral analysis technique is introduced
	to represent the statistical sample of a protein. {B}ased on such
	a framework, the weighted support vector machine ({SVM}) algorithm
	is applied. {T}he new approach has remarkable power in dealing with
	the bias caused by the situation when one subset in the training
	dataset contains many more samples than the other. {T}he new method
	is particularly useful when our focus is aimed at proteins belonging
	to small subsets. {T}he results obtained by the self-consistency
	test, jackknife test and independent dataset test are encouraging,
	indicating that the current approach may serve as a powerful complementary
	tool to other existing methods for predicting the types of membrane
	proteins.},
  doi = {10.1093/protein/gzh061},
  eprint = {http://peds.oupjournals.org/cgi/reprint/17/6/509.pdf},
  pdf = {../local/Wang2004Weighted-support.pdf},
  file = {Wang2004Weighted-support.pdf:local/Wang2004Weighted-support.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1093/protein/gzh061}
}
@article{Wang2004Support,
  author = {M-L. Wang and W-J. Li and M-L. Wang and W-B. Xu},
  title = {Support vector machines for prediction of peptidyl prolyl cis/trans
	isomerization.},
  journal = {J {P}ept {R}es},
  year = {2004},
  volume = {63},
  pages = {23--28},
  number = {1},
  month = {Jan},
  abstract = {A new method for peptidyl prolyl cis/trans isomerization prediction
	based on the theory of support vector machines ({SVM}) was introduced.
	{T}he {SVM} represents a new approach to supervised pattern classification
	and has been successfully applied to a wide range of pattern recognition
	problems. {I}n this study, six training datasets consisting of different
	length local sequence respectively were used. {T}he polynomial kernel
	functions with different parameter d were chosen. {T}he test for
	the independent testing dataset and the jackknife test were both
	carried out. {W}hen the local sequence length was 20-residue and
	the parameter d = 8, the {SVM} method archived the best performance
	with the correct rate for the cis and trans forms reaching 70.4 and
	69.7\% for the independent testing dataset, 76.7 and 76.6\% for the
	jackknife test, respectively. {M}atthew's correlation coefficients
	for the jackknife test could reach about 0.5. {T}he results obtained
	through this study indicated that the {SVM} method would become a
	powerful tool for predicting peptidyl prolyl cis/trans isomerization.},
  keywords = {biosvm},
  pii = {100}
}
@article{Wang2005Prediction,
  author = {Ming-Lei Wang and Hui Yao and Wen-Bo Xu},
  title = {Prediction by support vector machines and analysis by {Z}-score of
	poly-{L}-proline type {II} conformation based on local sequence.},
  journal = {Comput. {B}iol. {C}hem.},
  year = {2005},
  volume = {29},
  pages = {95--100},
  number = {2},
  month = {Apr},
  abstract = {In recent years, the poly-{L}-proline type {II} ({PPII}) conformation
	has gained more and more importance. {T}his structure plays vital
	roles in many biological processes. {B}ut few studies have been made
	to predict {PPII} secondary structures computationally. {T}he support
	vector machine ({SVM}) represents a new approach to supervised pattern
	classification and has been successfully applied to a wide range
	of pattern recognition problems. {I}n this paper, we present a {SVM}
	prediction method of {PPII} conformation based on local sequence.
	{T}he overall accuracy for both the independent testing set and estimate
	of jackknife testing reached approximately 70\%. {M}atthew's correlation
	coefficient ({MCC}) could reach 0.4. {B}y comparing the results of
	training and testing datasets with different sequence identities,
	we suggest that the performance of this method correlates with the
	sequence identity of dataset. {T}he parameter of {SVM} kernel function
	was an important factor to the performance of this method. {T}he
	propensities of residues located at different positions were also
	analyzed. {B}y computing {Z}-scores, we found that {P} and {G} were
	the two most important residues to {PPII} structure conformation.},
  doi = {10.1016/j.compbiolchem.2005.02.002},
  pdf = {../local/Wang2005Prediction.pdf},
  file = {Wang2005Prediction.pdf:local/Wang2005Prediction.pdf:PDF},
  keywords = {biosvm},
  pii = {S1476-9271(05)00017-4},
  url = {http://dx.doi.org/10.1016/j.compbiolchem.2005.02.002}
}
@article{Wang2005Gene,
  author = {Yu Wang and Igor V Tetko and Mark A Hall and Eibe Frank and Axel
	Facius and Klaus F X Mayer and Hans W Mewes},
  title = {Gene selection from microarray data for cancer classification--a
	machine learning approach.},
  journal = {Comput. {B}iol. {C}hem.},
  year = {2005},
  volume = {29},
  pages = {37--46},
  number = {1},
  month = {Feb},
  abstract = {A {DNA} microarray can track the expression levels of thousands of
	genes simultaneously. {P}revious research has demonstrated that this
	technology can be useful in the classification of cancers. {C}ancer
	microarray data normally contains a small number of samples which
	have a large number of gene expression levels as features. {T}o select
	relevant genes involved in different types of cancer remains a challenge.
	{I}n order to extract useful gene information from cancer microarray
	data and reduce dimensionality, feature selection algorithms were
	systematically investigated in this study. {U}sing a correlation-based
	feature selector combined with machine learning algorithms such as
	decision trees, na{\"i}ve {B}ayes and support vector machines, we show
	that classification performance at least as good as published results
	can be obtained on acute leukemia and diffuse large {B}-cell lymphoma
	microarray data sets. {W}e also demonstrate that a combined use of
	different classification and feature selection approaches makes it
	possible to select relevant genes with high confidence. {T}his is
	also the first paper which discusses both computational and biological
	evidence for the involvement of zyxin in leukaemogenesis.},
  doi = {10.1016/j.compbiolchem.2004.11.001},
  pdf = {../local/Wang2005Gene.pdf},
  file = {Wang2005Gene.pdf:local/Wang2005Gene.pdf:PDF},
  keywords = {biosvm microarray},
  pii = {S1476-9271(04)00108-2},
  url = {http://dx.doi.org/10.1016/j.compbiolchem.2004.11.001}
}
@article{Ward2003Secondary,
  author = {Ward, J. J. and McGuffin, L. J. and Buxton, B. F. and Jones, D. T.},
  title = {Secondary structure prediction with support vector machines},
  journal = {Bioinformatics},
  year = {2003},
  volume = {19},
  pages = {1650--1655},
  number = {13},
  abstract = {Motivation: {A} new method that uses support vector machines ({SVM}s)
	to predict protein secondary structure is described and evaluated.
	{T}he study is designed to develop a reliable prediction method using
	an alternative technique and to investigate the applicability of
	{SVM}s to this type of bioinformatics problem. {M}ethods: {B}inary
	{SVM}s are trained to discriminate between two structural classes.
	{T}he binary classifiers are combined in several ways to predict
	multi-class secondary structure. {R}esults: {T}he average three-state
	prediction accuracy per protein ({Q}3) is estimated by cross-validation
	to be 77.07 {+/-} 0.26\% with a segment overlap ({S}ov) score of 73.32
	{+/-} 0.39\%. {T}he {SVM} performs similarly to the 'state-of-the-art'
	{PSIPRED} prediction method on a non-homologous test set of 121 proteins
	despite being trained on substantially fewer examples. {A} simple
	consensus of the {SVM}, {PSIPRED} and {PROF}sec achieves significantly
	higher prediction accuracy than the individual methods. {A}vailability:
	{T}he {SVM} classifier is available from the authors. {W}ork is in
	progress to make the method available on-line and to integrate the
	{SVM} predictions into the {PSIPRED} server.},
  pdf = {../local/Ward2003Secondary.pdf},
  file = {Ward2003Secondary.pdf:local/Ward2003Secondary.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/13/1650}
}
@article{Waring2004Interlaboratory,
  author = {Jeffrey F Waring and Roger G Ulrich and Nick Flint and David Morfitt
	and Arno Kalkuhl and Frank Staedtler and Michael Lawton and Johanna
	M Beekman and Laura Suter},
  title = {Interlaboratory evaluation of rat hepatic gene expression changes
	induced by methapyrilene.},
  journal = {Environ {H}ealth {P}erspect},
  year = {2004},
  volume = {112},
  pages = {439--448},
  number = {4},
  month = {Mar},
  abstract = {Several studies using microarrays have shown that changes in gene
	expression provide information about the mechanism of toxicity induced
	by xenobiotic agents. {N}evertheless, the issue of whether gene expression
	profiles are reproducible across different laboratories remains to
	be determined. {T}o address this question, several members of the
	{H}epatotoxicity {W}orking {G}roup of the {I}nternational {L}ife
	{S}ciences {I}nstitute {H}ealth and {E}nvironmental {S}ciences {I}nstitute
	evaluated the liver gene expression profiles of rats treated with
	methapyrilene ({MP}). {A}nimals were treated at one facility, and
	{RNA} was distributed to five different sites for gene expression
	analysis. {A} preliminary evaluation of the number of modulated genes
	uncovered striking differences between the five different sites.
	{H}owever, additional data analysis demonstrated that these differences
	had an effect on the absolute gene expression results but not on
	the outcome of the study. {F}or all users, unsupervised algorithms
	showed that gene expression allows the distinction of the high dose
	of {MP} from controls and low dose. {I}n addition, the use of a supervised
	analysis method (support vector machines) made it possible to correctly
	classify samples. {I}n conclusion, the results show that, despite
	some variability, robust gene expression changes were consistent
	between sites. {I}n addition, key expression changes related to the
	mechanism of {MP}-induced hepatotoxicity were identified. {T}hese
	results provide critical information regarding the consistency of
	microarray results across different laboratories and shed light on
	the strengths and limitations of expression profiling in drug safety
	analysis.},
  keywords = {biosvm}
}
@article{Warmuth2003Active,
  author = {Warmuth, M. K. and Liao, J. and R{\"a}tsch, G. and Mathieson, M.
	and Putta, S. and Lemmen, C.},
  title = {Active learning with support vector machines in the drug discovery
	process.},
  journal = {J {C}hem {I}nf {C}omput {S}ci},
  year = {2003},
  volume = {43},
  pages = {667--673},
  number = {2},
  abstract = {We investigate the following data mining problem from computer-aided
	drug design: {F}rom a large collection of compounds, find those that
	bind to a target molecule in as few iterations of biochemical testing
	as possible. {I}n each iteration a comparatively small batch of compounds
	is screened for binding activity toward this target. {W}e employed
	the so-called "active learning paradigm" from {M}achine {L}earning
	for selecting the successive batches. {O}ur main selection strategy
	is based on the maximum margin hyperplane-generated by "{S}upport
	{V}ector {M}achines". {T}his hyperplane separates the current set
	of active from the inactive compounds and has the largest possible
	distance from any labeled compound. {W}e perform a thorough comparative
	study of various other selection strategies on data sets provided
	by {D}u{P}ont {P}harmaceuticals and show that the strategies based
	on the maximum margin hyperplane clearly outperform the simpler ones.},
  doi = {10.1021/ci025620t},
  pdf = {../local/Warmuth2003Active.pdf},
  file = {Warmuth2003Active.pdf:local/Warmuth2003Active.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1021/ci025620t}
}
@inproceedings{Warmuth2002Active,
  author = {Warmuth, M. K. and R{\"a}tsch, G. and Mathieson, M. and Liao, J.
	and Lemmen, C.},
  title = {Active learning in the drug discovery process},
  booktitle = {Adv. {N}eural {I}nform. {P}rocess. {S}yst.},
  year = {2002},
  editor = {T. G. Dietterich and S. Becker and Z. Ghahramani},
  volume = {14},
  pages = {1449--1456},
  publisher = {MIT Press},
  keywords = {biosvm},
  subject = {qsar}
}
@incollection{Watkins2000Dynamic,
  author = {C. Watkins},
  title = {Dynamic alignment kernels},
  booktitle = {Advances in {L}arge {M}argin {C}lassifiers},
  publisher = {MIT Press},
  year = {2000},
  editor = {A. J. Smola and P. L. Bartlett and B. Sch{\"o}lkopf and D. Schuurmans},
  pages = {39--50},
  address = {Cambridge, MA},
  pdf = {../local/Watkins2000Dynamic.pdf},
  file = {Watkins2000Dynamic.pdf:local/Watkins2000Dynamic.pdf:PDF},
  keywords = {biosvm},
  subject = {kernel},
  url = {http://www.cs.rhbnc.ac.uk/home/chrisw/dynk.ps.gz}
}
@article{Watson1953Structure,
  author = {Watson, J. D. and Crick, F. H. C.},
  title = {{M}olecular {S}tructure of {N}ucleic {A}cids: {A} {S}tructure for
	{D}eoxyribose {N}ucleic {A}cid},
  journal = {Nature},
  year = {1953},
  volume = {171},
  pages = {737--738},
  number = {4356},
  doi = {10.1038/171737a0},
  pdf = {../local/Watson1953Structure.pdf},
  file = {Watson1953Structure.pdf:local/Watson1953Structure.pdf:PDF},
  keywords = {bio},
  owner = {vert},
  url = {http://www.nature.com/genomics/human/watson-crick/index.html}
}
@article{Weathers2004Reduced,
  author = {Weathers, E. A. and Paulaitis, M. E. and Woolf, T. B. and Hoh, J.
	H.},
  title = {Reduced amino acid alphabet is sufficient to accurately recognize
	intrinsically disordered protein.},
  journal = {F{EBS} {L}ett.},
  year = {2004},
  volume = {576},
  pages = {348--352},
  number = {3},
  abstract = {Intrinsically disordered proteins are an important class of proteins
	with unique functions and properties. {H}ere, we have applied a support
	vector machine ({SVM}) trained on naturally occurring disordered
	and ordered proteins to examine the contribution of various parameters
	(vectors) to recognizing proteins that contain disordered regions.
	{W}e find that a {SVM} that incorporates only amino acid composition
	has a recognition accuracy of 87+/-2\%. {T}his result suggests that
	composition alone is sufficient to accurately recognize disorder.
	{I}nterestingly, {SVM}s using reduced sets of amino acids based on
	chemical similarity preserve high recognition accuracy. {A} set as
	small as four retains an accuracy of 84+/-2\%; this suggests that
	general physicochemical properties rather than specific amino acids
	are important factors contributing to protein disorder.},
  doi = {10.1016/j.febslet.2004.09.036},
  pdf = {../local/Weathers2004Reduced.pdf},
  file = {Weathers2004Reduced.pdf:local/Weathers2004Reduced.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1016/j.febslet.2004.09.036}
}
@article{Weber2002Building,
  author = {Griffin Weber and Staal Vinterbo and Lucila Ohno-Machado},
  title = {Building an asynchronous web-based tool for machine learning classification.},
  journal = {Proc {AMIA} {S}ymp},
  year = {2002},
  pages = {869--873},
  abstract = {Various unsupervised and supervised learning methods including support
	vector machines, classification trees, linear discriminant analysis
	and nearest neighbor classifiers have been used to classify high-throughput
	gene expression data. {S}impler and more widely accepted statistical
	tools have not yet been used for this purpose, hence proper comparisons
	between classification methods have not been conducted. {W}e developed
	free software that implements logistic regression with stepwise variable
	selection as a quick and simple method for initial exploration of
	important genetic markers in disease classification. {T}o implement
	the algorithm and allow our collaborators in remote locations to
	evaluate and compare its results against those of other methods,
	we developed a user-friendly asynchronous web-based application with
	a minimal amount of programming using free, downloadable software
	tools. {W}ith this program, we show that classification using logistic
	regression can perform as well as other more sophisticated algorithms,
	and it has the advantages of being easy to interpret and reproduce.
	{B}y making the tool freely and easily available, we hope to promote
	the comparison of classification methods. {I}n addition, we believe
	our web application can be used as a model for other bioinformatics
	laboratories that need to develop web-based analysis tools in a short
	amount of time and on a limited budget.},
  keywords = {Acute, Algorithms, Animals, Artificial Intelligence, Automated, Base
	Pair Mismatch, Base Pairing, Base Sequence, Biological, Biosensing
	Techniques, Classification, Cluster Analysis, Comparative Study,
	Computational Biology, Computer-Assisted, Cystadenoma, DNA, Drug,
	Drug Design, Eukaryotic Cells, Female, Gene Expression, Gene Expression
	Profiling, Gene Expression Regulation, Genes, Genetic, Genetic Markers,
	Hemolysins, Humans, Internet, Leukemia, Ligands, Likelihood Functions,
	Logistic Models, Lymphocytic, Markov Chains, Mathematics, Messenger,
	Models, Molecular, Molecular Probe Techniques, Molecular Sequence
	Data, Nanotechnology, Neoplasm, Neoplasms, Neoplastic, Neural Networks
	(Computer), Non-P.H.S., Non-U.S. Gov't, Nucleic Acid Conformation,
	Observer Variation, Oligonucleotide Array Sequence Analysis, Ovarian
	Neoplasms, P.H.S., Pattern Recognition, Probability, Protein Binding,
	Proteins, Quality Control, RNA, RNA Splicing, Receptors, Reference
	Values, Reproducibility of Results, Research Support, Sensitivity
	and Specificity, Sequence Analysis, Signal Processing, Software,
	Statistical, Stomach Neoplasms, Thermodynamics, Transcription, Tumor
	Markers, U.S. Gov't},
  pii = {D020001919},
  pmid = {12463949}
}
@article{Weston2003Feature,
  author = {Weston, J. and P{\'e}rez-Cruz, F. and Bousquet, O. and Chapelle,
	O. and Elisseeff, A. and Sch{\"o}lkopf, B.},
  title = {Feature selection and transduction for prediction of molecular bioactivity
	for drug design},
  journal = {Bioinformatics},
  year = {2003},
  volume = {19},
  pages = {764--771},
  number = {6},
  abstract = {Motivation: {I}n drug discovery a key task is to identify characteristics
	that separate active (binding) compounds from inactive (non-binding)
	ones. {A}n automated prediction system can help reduce resources
	necessary to carry out this task. {R}esults: {T}wo methods for prediction
	of molecular bioactivity for drug design are introduced and shown
	to perform well in a data set previously studied as part of the {KDD}
	({K}nowledge {D}iscovery and {D}ata {M}ining) {C}up 2001. {T}he data
	is characterized by very few positive examples, a very large number
	of features (describing three-dimensional properties of the molecules)
	and rather different distributions between training and test data.
	{T}wo techniques are introduced specifically to tackle these problems:
	a feature selection method for unbalanced data and a classifier which
	adapts to the distribution of the unlabeled test data (a so-called
	transductive method). {W}e show both techniques improve identification
	performance and in conjunction provide an improvement over using
	only one of the techniques. {O}ur results suggest the importance
	of taking into account the characteristics in this data which may
	also be relevant in other problems of a similar type. {A}vailability:
	{M}atlab source code is available at http://www.kyb.tuebingen.mpg.de/bs/people/weston/kdd/kdd.html
	{C}ontact: jason.weston@tuebingen.mpg.de {S}upplementary information:
	{S}upplementary material is available at http://www.kyb.tuebingen.mpg.de/bs/people/weston/kdd/kdd.html.},
  pdf = {../local/Weston2003Feature.pdf},
  file = {Weston2003Feature.pdf:local/Weston2003Feature.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/6/764}
}
@article{Wheeler2008Complete,
  author = {David A Wheeler and Maithreyan Srinivasan and Michael Egholm and
	Yufeng Shen and Lei Chen and Amy McGuire and Wen He and Yi-Ju Chen
	and Vinod Makhijani and G. Thomas Roth and Xavier Gomes and Karrie
	Tartaro and Faheem Niazi and Cynthia L Turcotte and Gerard P Irzyk
	and James R Lupski and Craig Chinault and Xing-zhi Song and Yue Liu
	and Ye Yuan and Lynne Nazareth and Xiang Qin and Donna M Muzny and
	Marcel Margulies and George M Weinstock and Richard A Gibbs and Jonathan
	M Rothberg},
  title = {The complete genome of an individual by massively parallel {DNA} sequencing.},
  journal = {Nature},
  year = {2008},
  volume = {452},
  pages = {872--876},
  number = {7189},
  month = {Apr},
  abstract = {The association of genetic variation with disease and drug response,
	and improvements in nucleic acid technologies, have given great optimism
	for the impact of 'genomic medicine'. However, the formidable size
	of the diploid human genome, approximately 6 gigabases, has prevented
	the routine application of sequencing methods to deciphering complete
	individual human genomes. To realize the full potential of genomics
	for human health, this limitation must be overcome. Here we report
	the DNA sequence of a diploid genome of a single individual, James
	D. Watson, sequenced to 7.4-fold redundancy in two months using massively
	parallel sequencing in picolitre-size reaction vessels. This sequence
	was completed in two months at approximately one-hundredth of the
	cost of traditional capillary electrophoresis methods. Comparison
	of the sequence to the reference genome led to the identification
	of 3.3 million single nucleotide polymorphisms, of which 10,654 cause
	amino-acid substitution within the coding sequence. In addition,
	we accurately identified small-scale (2-40,000 base pair (bp)) insertion
	and deletion polymorphism as well as copy number variation resulting
	in the large-scale gain and loss of chromosomal segments ranging
	from 26,000 to 1.5 million base pairs. Overall, these results agree
	well with recent results of sequencing of a single individual by
	traditional methods. However, in addition to being faster and significantly
	less expensive, this sequencing technology avoids the arbitrary loss
	of genomic sequences inherent in random shotgun sequencing by bacterial
	cloning because it amplifies DNA in a cell-free system. As a result,
	we further demonstrate the acquisition of novel human sequence, including
	novel genes not previously identified by traditional genomic sequencing.
	This is the first genome sequenced by next-generation technologies.
	Therefore it is a pilot for the future challenges of 'personalized
	genome sequencing'.},
  doi = {10.1038/nature06884},
  institution = {Human Genome Sequencing Center, Baylor College of Medicine, One Baylor
	Plaza, Houston, Texas 77030, USA.},
  keywords = {Alleles; Computational Biology; Genetic Predisposition to Disease,
	genetics; Genetic Variation, genetics; Genome, Human, genetics; Genomics,
	economics/methods/trends; Genotype; Humans; Individuality; Male;
	Oligonucleotide Array Sequence Analysis; Polymorphism, Single Nucleotide,
	genetics; Reproducibility of Results; Sensitivity and Specificity;
	Sequence Alignment; Sequence Analysis, DNA, economics/methods; Software},
  language = {eng},
  medline-pst = {ppublish},
  owner = {philippe},
  pii = {nature06884},
  pmid = {18421352},
  timestamp = {2010.07.28},
  url = {http://dx.doi.org/10.1038/nature06884}
}
@article{Wilbur2000Boosting,
  author = {W. J. Wilbur},
  title = {Boosting naive {B}ayesian learning on a large subset of {MEDLINE}.},
  journal = {Proc {AMIA} {S}ymp},
  year = {2000},
  pages = {918--922},
  abstract = {We are concerned with the rating of new documents that appear in a
	large database ({MEDLINE}) and are candidates for inclusion in a
	small specialty database ({REBASE}). {T}he requirement is to rank
	the new documents as nearly in order of decreasing potential to be
	added to the smaller database as possible, so as to improve the coverage
	of the smaller database without increasing the effort of those who
	manage this specialty database. {T}o perform this ranking task we
	have considered several machine learning approaches based on the
	na{\"i}ve {B}ayesian algorithm. {W}e find that adaptive boosting outperforms
	na{\"i}ve {B}ayes, but that a new form of boosting which we term staged
	{B}ayesian retrieval outperforms adaptive boosting. {S}taged {B}ayesian
	retrieval involves two stages of {B}ayesian retrieval and we further
	find that if the second stage is replaced by a support vector machine
	we again obtain a significant improvement over the strictly {B}ayesian
	approach.},
  keywords = {Acute, Acute Disease, Adenocarcinoma, Algorithms, Amino Acid Sequence,
	Animals, Artificial Intelligence, Automated, B-Lymphocytes, Bacterial
	Proteins, Base Pair Mismatch, Base Sequence, Bayes Theorem, Binding
	Sites, Biological, Bone Marrow Cells, Brachyura, Cell Compartmentation,
	Chemistry, Child, Chromosome Aberrations, Classification, Codon,
	Colonic Neoplasms, Comparative Study, Computational Biology, Computer
	Simulation, Computer-Assisted, DNA, Data Interpretation, Databases,
	Decision Trees, Diabetes Mellitus, Diagnosis, Discriminant Analysis,
	Discrimination Learning, Electric Conductivity, Electrophysiology,
	Escherichia coli Proteins, Factual, Feedback, Female, Fungal, Gastric
	Emptying, Gene Expression Profiling, Gene Expression Regulation,
	Genes, Genetic, Genetic Markers, Genetic Predisposition to Disease,
	Genomics, Hemolysins, Humans, Indians, Information Storage and Retrieval,
	Initiator, Ion Channels, Kinetics, Leukemia, Likelihood Functions,
	Lipid Bilayers, Logistic Models, Lymphocytic, MEDLINE, Male, Markov
	Chains, Melanoma, Models, Molecular, Myeloid, Neoplasm, Neoplasms,
	Neoplastic, Neural Networks (Computer), Neurological, Nevus, Non-P.H.S.,
	Non-U.S. Gov't, Nonlinear Dynamics, Normal Distribution, North American,
	Nucleic Acid Conformation, Oligonucleotide Array Sequence Analysis,
	Organ Specificity, Organelles, Ovarian Neoplasms, Ovary, P.H.S.,
	Pattern Recognition, Physical, Pigmented, Predictive Value of Tests,
	Promoter Regions (Genetics), Protein Biosynthesis, Protein Folding,
	Protein Structure, Proteins, Proteome, RNA, Reproducibility of Results,
	Research Support, Saccharomyces cerevisiae, Secondary, Sensitivity
	and Specificity, Sequence Alignment, Sequence Analysis, Sex Characteristics,
	Skin Diseases, Skin Neoplasms, Skin Pigmentation, Software, Sound
	Spectrography, Statistical, Stomach Diseases, T-Lymphocytes, Thermodynamics,
	Transcription, Transcription Factors, Tumor Markers, Type 2, U.S.
	Gov't, Vertebrates},
  pii = {D200250},
  pmid = {11080018}
}
@article{Williams2004Prognostic,
  author = {Williams, R. D. and Hing, S. N. and Greer, B. T. and Whiteford, C. C.
	and Wei, J. S. and Natrajan, R. and Kelsey, A. and Rogers, S. and
	Campbell, C. and Pritchard-Jones, K. and Khan, J.},
  title = {Prognostic classification of relapsing favorable histology {W}ilms
	tumor using c{DNA} microarray expression profiling and support vector
	machines.},
  journal = {Genes {C}hromosomes {C}ancer},
  year = {2004},
  volume = {41},
  pages = {65--79},
  number = {1},
  month = {Sep},
  abstract = {Treatment of {W}ilms tumor has a high success rate, with some 85\%
	of patients achieving long-term survival. {H}owever, late effects
	of treatment and management of relapse remain significant clinical
	problems. {I}f accurate prognostic methods were available, effective
	risk-adapted therapies could be tailored to individual patients at
	diagnosis. {F}ew molecular prognostic markers for {W}ilms tumor are
	currently defined, though previous studies have linked allele loss
	on 1p or 16q, genomic gain of 1q, and overexpression from 1q with
	an increased risk of relapse. {T}o identify specific patterns of
	gene expression that are predictive of relapse, we used high-density
	(30 k) c{DNA} microarrays to analyze {RNA} samples from 27 favorable
	histology {W}ilms tumors taken from primary nephrectomies at the
	time of initial diagnosis. {T}hirteen of these tumors relapsed within
	2 years. {G}enes differentially expressed between the relapsing and
	nonrelapsing tumor classes were identified by statistical scoring
	(t test). {T}hese genes encode proteins with diverse molecular functions,
	including transcription factors, developmental regulators, apoptotic
	factors, and signaling molecules. {U}se of a support vector machine
	classifier, feature selection, and test evaluation using cross-validation
	led to identification of a generalizable expression signature, a
	small subset of genes whose expression potentially can be used to
	predict tumor outcome in new samples. {S}imilar methods were used
	to identify genes that are differentially expressed between tumors
	with and without genomic 1q gain. {T}his set of discriminators was
	highly enriched in genes on 1q, indicating close agreement between
	data obtained from expression profiling with data from genomic copy
	number analyses.},
  doi = {10.1002/gcc.20060},
  pdf = {../local/Williams2004Prognostic.pdf},
  file = {Williams2004Prognostic.pdf:local/Williams2004Prognostic.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1002/gcc.20060}
}
@article{Wilton2003Comparison,
  author = {D. Wilton and P. Willett and K. Lawson and G. Mullier},
  title = {Comparison of ranking methods for virtual screening in lead-discovery
	programs.},
  journal = {J {C}hem {I}nf {C}omput {S}ci},
  year = {2003},
  volume = {43},
  pages = {469--474},
  number = {2},
  abstract = {This paper discusses the use of several rank-based virtual screening
	methods for prioritizing compounds in lead-discovery programs, given
	a training set for which both structural and bioactivity data are
	available. {S}tructures from the {NCI} {AIDS} data set and from the
	{S}yngenta corporate database were represented by two types of fragment
	bit-string and by sets of high-level molecular features. {T}hese
	representations were processed using binary kernel discrimination,
	similarity searching, substructural analysis, support vector machine,
	and trend vector analysis, with the effectiveness of the methods
	being judged by the extent to which active test set molecules were
	clustered toward the top of the resultant rankings. {T}he binary
	kernel discrimination approach yielded consistently superior rankings
	and would appear to have considerable potential for chemical screening
	applications.},
  doi = {10.1021/ci025586i},
  pdf = {../local/Wilton2003Comparison.pdf},
  file = {Wilton2003Comparison.pdf:local/Wilton2003Comparison.pdf:PDF},
  keywords = {biosvm},
  url = {http://dx.doi.org/10.1021/ci025586i}
}
@article{Winters-Hilt2003Highly,
  author = {Winters-Hilt, S. and Vercoutere, W. and DeGuzman, V. S. and Deamer,
	D. and Akeson, M. and Haussler, D.},
  title = {Highly accurate classification of {W}atson-{C}rick basepairs on termini
	of single {DNA} molecules.},
  journal = {Biophys. {J}.},
  year = {2003},
  volume = {84},
  pages = {967--976},
  number = {2},
  abstract = {We introduce a computational method for classification of individual
	{DNA} molecules measured by an $\alpha$-hemolysin channel detector.
	{W}e show classification with better than 99\% accuracy for {DNA}
	hairpin molecules that differ only in their terminal {W}atson-{C}rick
	basepairs. {S}ignal classification was done in silico to establish
	performance metrics (i.e., where train and test data were of known
	type, via single-species data files). {I}t was then performed in
	solution to assay real mixtures of {DNA} hairpins. {H}idden {M}arkov
	{M}odels ({HMM}s) were used with {E}xpectation/{M}aximization for
	denoising and for associating a feature vector with the ionic current
	blockade of the {DNA} molecule. {S}upport {V}ector {M}achines ({SVM}s)
	were used as discriminators, and were the focus of off-line training.
	{A} multiclass {SVM} architecture was designed to place less discriminatory
	load on weaker discriminators, and novel {SVM} kernels were used
	to boost discrimination strength. {T}he tuning on {HMM}s and {SVM}s
	enabled biophysical analysis of the captured molecule states and
	state transitions; structure revealed in the biophysical analysis
	was used for better feature selection.},
  pdf = {../local/Winters-Hilt2003Highly.pdf},
  file = {Winters-Hilt2003Highly.pdf:local/Winters-Hilt2003Highly.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://www.biophysj.org/cgi/content/abstract/84/2/967}
}
@article{Wu2003Comparison,
  author = {Wu, B. and Abbott, T. and Fishman, D. and McMurray, W. and Mor, G.
	and Stone, K. and Ward, D. and Williams, K. and Zhao, H.},
  title = {Comparison of statistical methods for classification of ovarian cancer
	using mass spectrometry data},
  journal = {Bioinformatics},
  year = {2003},
  volume = {19},
  pages = {1636--1643},
  number = {13},
  abstract = {Motivation: {N}ovel methods, both molecular and statistical, are urgently
	needed to take advantage of recent advances in biotechnology and
	the human genome project for disease diagnosis and prognosis. {M}ass
	spectrometry ({MS}) holds great promise for biomarker identification
	and genome-wide protein profiling. {I}t has been demonstrated in
	the literature that biomarkers can be identified to distinguish normal
	individuals from cancer patients using {MS} data. {S}uch progress
	is especially exciting for the detection of early-stage ovarian cancer
	patients. {A}lthough various statistical methods have been utilized
	to identify biomarkers from {MS} data, there has been no systematic
	comparison among these approaches in their relative ability to analyze
	{MS} data. {R}esults: {W}e compare the performance of several classes
	of statistical methods for the classification of cancer based on
	{MS} spectra. {T}hese methods include: linear discriminant analysis,
	quadratic discriminant analysis, k-nearest neighbor classifier, bagging
	and boosting classification trees, support vector machine, and random
	forest ({RF}). {T}he methods are applied to ovarian cancer and control
	serum samples from the {N}ational {O}varian {C}ancer {E}arly {D}etection
	{P}rogram clinic at {N}orthwestern {U}niversity {H}ospital. {W}e
	found that {RF} outperforms other methods in the analysis of {MS}
	data. {S}upplementary information: http://bioinformatics.med.yale.edu/proteomics/BioSupp1.html},
  pdf = {../local/Wu2003Comparison.pdf},
  file = {Wu2003Comparison.pdf:local/Wu2003Comparison.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/13/1636}
}
@article{Xie2005LOCSVMPSI,
  author = {Xie, Dan and Li, Ao and Wang, Minghui and Fan, Zhewen and Feng, Huanqing},
  title = {{LOCSVMPSI}: a web server for subcellular localization of eukaryotic
	proteins using {SVM} and profile of {PSI}-{BLAST}.},
  journal = {Nucleic {A}cids {R}es.},
  year = {2005},
  volume = {33},
  pages = {W105--W110},
  number = {Web Server issue},
  month = jul,
  abstract = {Subcellular location of a protein is one of the key functional characters
	as proteins must be localized correctly at the subcellular level
	to have normal biological function. {I}n this paper, a novel method
	named {LOCSVMPSI} has been introduced, which is based on the support
	vector machine ({SVM}) and the position-specific scoring matrix generated
	from profiles of {PSI}-{BLAST}. {W}ith a jackknife test on the {RH}2427
	data set, {LOCSVMPSI} achieved a high overall prediction accuracy
	of 90.2\%, which is higher than the prediction results by {S}ub{L}oc
	and {ESL}pred on this data set. {I}n addition, prediction performance
	of {LOCSVMPSI} was evaluated with 5-fold cross validation test on
	the {PK}7579 data set and the prediction results were consistently
	better than the previous method based on several {SVM}s using composition
	of both amino acids and amino acid pairs. {F}urther test on the {SWISSPROT}
	new-unique data set showed that {LOCSVMPSI} also performed better
	than some widely used prediction methods, such as {PSORTII}, {T}arget{P}
	and {LOC}net. {A}ll these results indicate that {LOCSVMPSI} is a
	powerful tool for the prediction of eukaryotic protein subcellular
	localization. {A}n online web server (current version is 1.3) based
	on this method has been developed and is freely available to both
	academic and commercial users, which can be accessed by at http://Bioinformatics.ustc.edu.cn/LOCSVMPSI/LOCSVMPSI.php.},
  doi = {10.1093/nar/gki359},
  pdf = {../local/Xie2005LOCSVMPSI.pdf},
  file = {Xie2005LOCSVMPSI.pdf:local/Xie2005LOCSVMPSI.pdf:PDF},
  keywords = {biosvm},
  pii = {33/suppl_2/W105},
  url = {http://dx.doi.org/10.1093/nar/gki359}
}
@article{Xie2009Unified,
  author = {Xie, Lei and Xie, Li and Bourne, Philip E},
  title = {A unified statistical model to support local sequence order independent
	similarity searching for ligand-binding sites and its application
	to genome-based drug discovery.},
  journal = {Bioinformatics},
  year = {2009},
  volume = {25},
  pages = {i305--i312},
  number = {12},
  month = jun,
  abstract = {Functional relationships between proteins that do not share global
	structure similarity can be established by detecting their ligand-binding-site
	similarity. For a large-scale comparison, it is critical to accurately
	and efficiently assess the statistical significance of this similarity.
	Here, we report an efficient statistical model that supports local
	sequence order independent ligand-binding-site similarity searching.
	Most existing statistical models only take into account the matching
	vertices between two sites that are defined by a fixed number of
	points. In reality, the boundary of the binding site is not known
	or is dependent on the bound ligand making these approaches limited.
	To address these shortcomings and to perform binding-site mapping
	on a genome-wide scale, we developed a sequence-order independent
	profile-profile alignment (SOIPPA) algorithm that is able to detect
	local similarity between unknown binding sites a priori. The SOIPPA
	scoring integrates geometric, evolutionary and physical information
	into a unified framework. However, this imposes a significant challenge
	in assessing the statistical significance of the similarity because
	the conventional probability model that is based on fixed-point matching
	cannot be applied. Here we find that scores for binding-site matching
	by SOIPPA follow an extreme value distribution (EVD). Benchmark studies
	show that the EVD model performs at least two-orders faster and is
	more accurate than the non-parametric statistical method in the previous
	SOIPPA version. Efficient statistical analysis makes it possible
	to apply SOIPPA to genome-based drug discovery. Consequently, we
	have applied the approach to the structural genome of Mycobacterium
	tuberculosis to construct a protein-ligand interaction network. The
	network reveals highly connected proteins, which represent suitable
	targets for promiscuous drugs.},
  doi = {10.1093/bioinformatics/btp220},
  institution = {San Diego Supercomputer Center, University of California, San Diego,
	La Jolla, CA 92093, USA. lxie@sdsc.edu},
  keywords = {Binding Sites; Computational Biology, methods; Drug Discovery, methods;
	Genome; Ligands; Models, Statistical; Mycobacterium tuberculosis,
	genetics/metabolism; Proteins, chemistry},
  language = {eng},
  medline-pst = {ppublish},
  owner = {bricehoffmann},
  pii = {btp220},
  pmid = {19478004},
  timestamp = {2009.07.27},
  url = {http://dx.doi.org/10.1093/bioinformatics/btp220}
}
@article{Xing2004LOGOS,
  author = {Xing, E. P. and Wu, W. and Jordan, M. I. and Karp, R. M.},
  title = {{LOGOS}: {A} modular {Bayesian} model for de novo motif detection},
  journal = {J. {Bioinform}. {Comput}. {Biol}.},
  volume = {2},
  pages = {127--154},
  year = {2004},
  doi = {10.1142/S0219720004000508},
  url = {http://dx.doi.org/10.1142/S0219720004000508},
  abstract = {The complexity of the global organization and internal structure of
	motifs in higher eukaryotic organisms raises significant challenges
	for motif detection techniques. {T}o achieve successful de novo motif
	detection, it is necessary to model the complex dependencies within
	and among motifs and to incorporate biological prior knowledge. {I}n
	this paper, we present {LOGOS}, an integrated {LO}cal and {G}l{O}bal
	motif {S}equence model for biopolymer sequences, which provides a
	principled framework for developing, modularizing, extending and
	computing expressive motif models for complex biopolymer sequence
	analysis. {LOGOS} consists of two interacting submodels: {HMDM},
	a local alignment model capturing biological prior knowledge and
	positional dependency within the motif local structure; and {HMM},
	a global motif distribution model modeling frequencies and dependencies
	of motif occurrences. {M}odel parameters can be fit using training
	motifs within an empirical {B}ayesian framework. {A} variational
	{EM} algorithm is developed for de novo motif detection. {LOGOS}
	improves over existing models that ignore biological priors and dependencies
	in motif structures and motif occurrences, and demonstrates superior
	performance on both semi-realistic test data and cis-regulatory sequences
	from yeast and {D}rosophila genomes with regard to sensitivity, specificity,
	flexibility and extensibility.},
  keywords = {biogm},
  pdf = {../local/Xing2004LOGOS.pdf},
  file = {Xing2004LOGOS.pdf:Xing2004LOGOS.pdf:PDF},
  owner = {vert},
  timestamp = {2006.01.18}
}
@article{Xiong2001Biomarker,
  author = {Xiong, M. and Fang, X. and Zhao, J.},
  title = {Biomarker {I}dentification by {F}eature {W}rappers},
  journal = {Genome {R}es.},
  year = {2001},
  volume = {11},
  pages = {1878--1887},
  number = {11},
  abstract = {Gene expression studies bridge the gap between {DNA} information and
	trait information by dissecting biochemical pathways into intermediate
	components between genotype and phenotype. {T}hese studies open new
	avenues for identifying complex disease genes and biomarkers for
	disease diagnosis and for assessing drug efficacy and toxicity. {H}owever,
	the majority of analytical methods applied to gene expression data
	are not efficient for biomarker identification and disease diagnosis.
	{I}n this paper, we propose a general framework to incorporate feature
	(gene) selection into pattern recognition in the process to identify
	biomarkers. {U}sing this framework, we develop three feature wrappers
	that search through the space of feature subsets using the classification
	error as measure of goodness for a particular feature subset being
	``wrapped around'': linear discriminant analysis, logistic regression,
	and support vector machines. {T}o effectively carry out this computationally
	intensive search process, we employ sequential forward search and
	sequential forward floating search algorithms. {T}o evaluate the
	performance of feature selection for biomarker identification we
	have applied the proposed methods to three data sets. {T}he preliminary
	results demonstrate that very high classification accuracy can be
	attained by identified composite classifiers with several biomarkers.},
  pdf = {../local/Xiong2001Biomarker.pdf},
  file = {Xiong2001Biomarker.pdf:local/Xiong2001Biomarker.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://www.genome.org/cgi/content/abstract/11/11/1878}
}
@article{Xu2004Molecular,
  author = {Xu, Xiu-Qin and Leow, Chon K and Lu, Xin and Zhang, Xuegong and Liu,
	Jun S and Wong, Wing-Hung and Asperger, Arndt and Deininger, S{\"o}ren
	and Leung, Hon-Chiu Eastwood},
  title = {Molecular classification of liver cirrhosis in a rat model by proteomics
	and bioinformatics.},
  journal = {Proteomics},
  year = {2004},
  volume = {4},
  pages = {3235--3245},
  number = {10},
  month = oct,
  abstract = {Liver cirrhosis is a worldwide health problem. {R}eliable, noninvasive
	methods for early detection of liver cirrhosis are not available.
	{U}sing a three-step approach, we classified sera from rats with
	liver cirrhosis following different treatment insults. {T}he approach
	consisted of: (i) protein profiling using surface-enhanced laser
	desorption/ionization ({SELDI}) technology; (ii) selection of a statistically
	significant serum biomarker set using machine learning algorithms;
	and (iii) identification of selected serum biomarkers by peptide
	sequencing. {W}e generated serum protein profiles from three groups
	of rats: (i) normal (n=8), (ii) thioacetamide-induced liver cirrhosis
	(n=22), and (iii) bile duct ligation-induced liver fibrosis (n=5)
	using a weak cation exchanger surface. {P}rofiling data were further
	analyzed by a recursive support vector machine algorithm to select
	a panel of statistically significant biomarkers for class prediction.
	{S}ensitivity and specificity of classification using the selected
	protein marker set were higher than 92\%. {A} consistently down-regulated
	3495 {D}a protein in cirrhosis samples was one of the selected significant
	biomarkers. {T}his 3495 {D}a protein was purified on-chip and trypsin
	digested. {F}urther structural characterization of this biomarkers
	candidate was done by using cross-platform matrix-assisted laser
	desorption/ionization mass spectrometry ({MALDI}-{MS}) peptide mass
	fingerprinting ({PMF}) and matrix-assisted laser desorption/ionization
	time of flight/time of flight ({MALDI}-{TOF}/{TOF}) tandem mass spectrometry
	({MS}/{MS}). {C}ombined data from {PMF} and {MS}/{MS} spectra of
	two tryptic peptides suggested that this 3495 {D}a protein shared
	homology to a histidine-rich glycoprotein. {T}hese results demonstrated
	a novel approach to discovery of new biomarkers for early detection
	of liver cirrhosis and classification of liver diseases.},
  doi = {10.1002/pmic.200400839},
  pdf = {../local/Xu2004Molecular.pdf},
  file = {Xu2004Molecular.pdf:Xu2004Molecular.pdf:PDF},
  keywords = {biosvm},
  url = {http://dx.doi.org/10.1002/pmic.200400839}
}
@article{Xue2004Support,
  author = {Xue, C. X. and Zhang, R. S. and Liu, H. X. and Liu, M. C. and Hu, Z. D.
	and Fan, B. T.},
  title = {Support vector machines-based quantitative structure-property relationship
	for the prediction of heat capacity.},
  journal = {J {C}hem {I}nf {C}omput {S}ci},
  year = {2004},
  volume = {44},
  pages = {1267--1274},
  number = {4},
  abstract = {The support vector machine ({SVM}), as a novel type of learning machine,
	for the first time, was used to develop a {Q}uantitative {S}tructure-{P}roperty
	{R}elationship ({QSPR}) model of the heat capacity of a diverse set
	of 182 compounds based on the molecular descriptors calculated from
	the structure alone. {M}ultiple linear regression ({MLR}) and radial
	basis function networks ({RBFNN}s) were also utilized to construct
	quantitative linear and nonlinear models to compare with the results
	obtained by {SVM}. {T}he root-mean-square (rms) errors in heat capacity
	predictions for the whole data set given by {MLR}, {RBFNN}s, and
	{SVM} were 4.648, 4.337, and 2.931 heat capacity units, respectively.
	{T}he prediction results are in good agreement with the experimental
	value of heat capacity; also, the results reveal the superiority
	of the {SVM} over {MLR} and {RBFNN}s models.},
  doi = {10.1021/ci049934n},
  pdf = {../local/Xue2004Support.pdf},
  file = {Xue2004Support.pdf:local/Xue2004Support.pdf:PDF},
  keywords = {biosvm},
  url = {http://dx.doi.org/10.1021/ci049934n}
}
@article{Xue2004accurate,
  author = {Xue, C. X. and Zhang, R. S. and Liu, H. X. and Yao, X. J. and Liu, M. C.
	and Hu, Z. D. and Fan, B. T.},
  title = {An accurate {QSPR} study of {O}-{H} bond dissociation energy in substituted
	phenols based on support vector machines.},
  journal = {J {C}hem {I}nf {C}omput {S}ci},
  year = {2004},
  volume = {44},
  pages = {669--677},
  number = {2},
  abstract = {The support vector machine ({SVM}), as a novel type of learning machine,
	was used to develop a {Q}uantitative {S}tructure-{P}roperty {R}elationship
	({QSPR}) model of the {O}-{H} bond dissociation energy ({BDE}) of
	78 substituted phenols. {T}he six descriptors calculated solely from
	the molecular structures of compounds selected by forward stepwise
	regression were used as inputs for the {SVM} model. {T}he root-mean-square
	(rms) errors in {BDE} predictions for the training, test, and overall
	data sets were 3.808, 3.320, and 3.713 {BDE} units (k{J} mol(-1)),
	respectively. {T}he results obtained by {G}aussian-kernel {SVM} were
	much better than those obtained by multiple linear regression, radial
	basis function neural networks, linear-kernel {SVM}, and other {QSPR}
	approaches.},
  doi = {10.1021/ci034248u},
  pdf = {../local/Xue2004accurate.pdf},
  file = {Xue2004accurate.pdf:local/Xue2004accurate.pdf:PDF},
  keywords = {biosvm},
  url = {http://dx.doi.org/10.1021/ci034248u}
}
@article{Xue2004QSAR,
  author = {Xue, C. X. and Zhang, R. S. and Liu, H. X. and Yao, X. J. and Liu, M. C.
	and Hu, Z. D. and Fan, B. T.},
  title = {{QSAR} models for the prediction of binding affinities to human serum
	albumin using the heuristic method and a support vector machine.},
  journal = {J {C}hem {I}nf {C}omput {S}ci},
  year = {2004},
  volume = {44},
  pages = {1693--1700},
  number = {5},
  abstract = {The binding affinities to human serum albumin for 94 diverse drugs
	and drug-like compounds were modeled with the descriptors calculated
	from the molecular structure alone using a quantitative structure-activity
	relationship ({QSAR}) technique. {T}he heuristic method ({HM}) and
	support vector machine ({SVM}) were utilized to construct the linear
	and nonlinear prediction models, leading to a good correlation coefficient
	({R}2) of 0.86 and 0.94 and root-mean-square errors (rms) of 0.212
	and 0.134 albumin drug binding affinity units, respectively. {F}urthermore,
	the models were evaluated by a 10 compound external test set, yielding
	{R}2 of 0.71 and 0.89 and rms error of 0.430 and 0.222. {T}he specific
	information described by the heuristic linear model could give some
	insights into the factors that are likely to govern the binding affinity
	of the compounds and be used as an aid to the drug design process;
	however, the prediction results of the nonlinear {SVM} model seem
	to be better than that of the {HM}.},
  doi = {10.1021/ci049820b},
  pdf = {../local/Xue2004QSAR.pdf},
  file = {Xue2004QSAR.pdf:local/Xue2004QSAR.pdf:PDF},
  keywords = {biosvm},
  url = {http://dx.doi.org/10.1021/ci049820b}
}
@article{Xue2004Study,
  author = {Xue, C. X. and Zhang, R. S. and Liu, M. C. and Hu, Z. D. and Fan, B. T.},
  title = {Study of the quantitative structure-mobility relationship of carboxylic
	acids in capillary electrophoresis based on support vector machines.},
  journal = {J {C}hem {I}nf {C}omput {S}ci},
  year = {2004},
  volume = {44},
  pages = {950--957},
  number = {3},
  abstract = {The support vector machines ({SVM}), as a novel type of learning machine,
	were used to develop a quantitative structure-mobility relationship
	({QSMR}) model of 58 aliphatic and aromatic carboxylic acids based
	on molecular descriptors calculated from the structure alone. {M}ultiple
	linear regression ({MLR}) and radial basis function neural networks
	({RBFNN}s) were also utilized to construct the linear and the nonlinear
	model to compare with the results obtained by {SVM}. {T}he root-mean-square
	errors in absolute mobility predictions for the whole data set given
	by {MLR}, {RBFNN}s, and {SVM} were 1.530, 1.373, and 0.888 mobility
	units (10(-5) cm(2) {S}(-1) {V}(-1)), respectively, which indicated
	that the prediction result agrees well with the experimental values
	of these compounds and also revealed the superiority of {SVM} over
	{MLR} and {RBFNN}s models for the prediction of the absolute mobility
	of carboxylic acids. {M}oreover, the models we proposed could also
	provide some insight into what structural features are related to
	the absolute mobility of aliphatic and aromatic carboxylic acids.},
  doi = {10.1021/ci034280o},
  pdf = {../local/Xue2004Study.pdf},
  file = {Xue2004Study.pdf:local/Xue2004Study.pdf:PDF},
  keywords = {biosvm},
  url = {http://dx.doi.org/10.1021/ci034280o}
}
@article{Xue2001Fingerprint,
  author = {Xue, L. and Stahura, F. L. and Godden, J. W. and Bajorath, J.},
  title = {Fingerprint scaling increases the probability of identifying molecules
	with similar activity in virtual screening calculations.},
  journal = {J {C}hem {I}nf {C}omput {S}ci},
  year = {2001},
  volume = {41},
  pages = {746--753},
  number = {3},
  abstract = {Results of systematic virtual screening calculations using a structural
	key-type fingerprint are reported for compounds belonging to 14 activity
	classes added to randomly selected synthetic molecules. For each
	class, a fingerprint profile was calculated to monitor the relative
	occupancy of fingerprint bit positions. Consensus bit patterns were
	determined consisting of all bits that were always set on in compounds
	belonging to a specific activity class. In virtual screening calculations,
	scale factors were applied to each consensus bit position in fingerprints
	of query molecules. This technique, called ``fingerprint scaling'',
	effectively increases the weight of consensus bit positions in fingerprint
	comparisons. Although overall prediction accuracy was satisfactory
	using unscaled calculations, scaling significantly increased the
	number of correct predictions but only slightly increased the rate
	of false positives. These observations suggest that fingerprint scaling
	is an attractive approach to increase the probability of identifying
	molecules with similar activity by virtual screening. It requires
	the availability of a series of related compounds and can be easily
	applied to any keyed fingerprint representation that associates bit
	positions with specific molecular features.},
  doi = {10.1021/ci000311t},
  keywords = {16S, Algae, Algorithms, Animals, Archaeal, Automation, Bacteria, Biodiversity,
	Chemical, Colorimetry, Computational Biology, Computer Terminals,
	DNA, DNA Fingerprinting, Daphnia, Databases, Ecosystem, Euryarchaeota,
	Factual, Fresh Water, Hazardous Substances, Humans, Information Storage
	and Retrieval, Methane, Models, Non-U.S. Gov't, Oxidoreductases,
	Perciformes, Photic Stimulation, Photometry, Polymorphism, Quantitative
	Structure-Activity Relationship, RNA, Research Support, Restriction
	Fragment Length, Ribosomal, Seasons, Soil Microbiology, Spain, Sulfur,
	Theoretical, Time Factors, Toxicity Tests, Water Microbiology, Water
	Pollutants},
  owner = {mahe},
  pii = {ci000311t},
  pmid = {11410055},
  timestamp = {2006.09.03},
  url = {http://dx.doi.org/10.1021/ci000311t}
}
@article{Xue2004Effect,
  author = {Xue, Y. and Li, Z. R. and Yap, C. W. and Sun, L. Z. and Chen, X. and
	Chen, Y. Z.},
  title = {Effect of molecular descriptor feature selection in support vector
	machine classification of pharmacokinetic and toxicological properties
	of chemical agents.},
  journal = {J {C}hem {I}nf {C}omput {S}ci},
  year = {2004},
  volume = {44},
  pages = {1630--1638},
  number = {5},
  abstract = {Statistical-learning methods have been developed for facilitating
	the prediction of pharmacokinetic and toxicological properties of
	chemical agents. {T}hese methods employ a variety of molecular descriptors
	to characterize structural and physicochemical properties of molecules.
	{S}ome of these descriptors are specifically designed for the study
	of a particular type of properties or agents, and their use for other
	properties or agents might generate noise and affect the prediction
	accuracy of a statistical learning system. {T}his work examines to
	what extent the reduction of this noise can improve the prediction
	accuracy of a statistical learning system. {A} feature selection
	method, recursive feature elimination ({RFE}), is used to automatically
	select molecular descriptors for support vector machines ({SVM})
	prediction of {P}-glycoprotein substrates ({P}-gp), human intestinal
	absorption of molecules ({HIA}), and agents that cause torsades de
	pointes ({T}d{P}), a rare but serious side effect. {RFE} significantly
	reduces the number of descriptors for each of these properties thereby
	increasing the computational speed for their classification. {T}he
	{SVM} prediction accuracies of {P}-gp and {HIA} are substantially
	increased and that of {T}d{P} remains unchanged by {RFE}. {T}hese
	prediction accuracies are comparable to those of earlier studies
	derived from a selective set of descriptors. {O}ur study suggests
	that molecular feature selection is useful for improving the speed
	and, in some cases, the accuracy of statistical learning methods
	for the prediction of pharmacokinetic and toxicological properties
	of chemical agents.},
  doi = {10.1021/ci049869h},
  pdf = {../local/Xue2004Effect.pdf},
  file = {Xue2004Effect.pdf:local/Xue2004Effect.pdf:PDF},
  keywords = {biosvm},
  url = {http://dx.doi.org/10.1021/ci049869h}
}
@article{Xue2004Prediction,
  author = {Xue, Y. and Yap, C. W. and Sun, L. Z. and Cao, Z. W. and Wang, J. F.
	and Chen, Y. Z.},
  title = {Prediction of {P}-glycoprotein substrates by a support vector machine
	approach.},
  journal = {J {C}hem {I}nf {C}omput {S}ci},
  year = {2004},
  volume = {44},
  pages = {1497--1505},
  number = {4},
  abstract = {P-glycoproteins ({P}-gp) actively transport a wide variety of chemicals
	out of cells and function as drug efflux pumps that mediate multidrug
	resistance and limit the efficacy of many drugs. {M}ethods for facilitating
	early elimination of potential {P}-gp substrates are useful for facilitating
	new drug discovery. {A} computational ensemble pharmacophore model
	has recently been used for the prediction of {P}-gp substrates with
	a promising accuracy of 63\%. {I}t is desirable to extend the prediction
	range beyond compounds covered by the known pharmacophore models.
	{F}or such a purpose, a machine learning method, support vector machine
	({SVM}), was explored for the prediction of {P}-gp substrates. {A}
	set of 201 chemical compounds, including 116 substrates and 85 nonsubstrates
	of {P}-gp, was used to train and test a {SVM} classification system.
	{T}his {SVM} system gave a prediction accuracy of at least 81.2\%
	for {P}-gp substrates based on two different evaluation methods,
	which is substantially improved against that obtained from the multiple-pharmacophore
	model. {T}he prediction accuracy for nonsubstrates of {P}-gp is 79.2\%
	using 5-fold cross-validation. {T}hese accuracies are slightly better
	than those obtained from other statistical classification methods,
	including k-nearest neighbor (k-{NN}), probabilistic neural networks
	({PNN}), and {C}4.5 decision tree, that use the same sets of data
	and molecular descriptors. {O}ur study indicates the potential of
	{SVM} in facilitating the prediction of {P}-gp substrates.},
  doi = {10.1021/ci049971e},
  pdf = {../local/Xue2004Prediction.pdf},
  file = {Xue2004Prediction.pdf:local/Xue2004Prediction.pdf:PDF},
  keywords = {biosvm},
  url = {http://dx.doi.org/10.1021/ci049971e}
}
@article{Yabuki2005GRIFFIN,
  author = {Yabuki, Y. and Muramatsu, T. and Hirokawa, T. and Mukai, H. and Suwa,
	M.},
  title = {{GRIFFIN}: a system for predicting {GPCR}-{G}-protein coupling selectivity
	using a support vector machine and a hidden {M}arkov model.},
  journal = {Nucleic {A}cids {R}es.},
  year = {2005},
  volume = {33},
  pages = {W148--W153},
  number = {Web Server issue},
  month = jul,
  abstract = {We describe a novel system, {GRIFFIN} ({G}-protein and {R}eceptor
	{I}nteraction {F}eature {F}inding {IN}strument), that predicts {G}-protein
	coupled receptor ({GPCR}) and {G}-protein coupling selectivity based
	on a support vector machine ({SVM}) and a hidden {M}arkov model ({HMM})
	with high sensitivity and specificity. {B}ased on our assumption
	that whole structural segments of ligands, {GPCR}s and {G}-proteins
	are essential to determine {GPCR} and {G}-protein coupling, various
	quantitative features were selected for ligands, {GPCR}s and {G}-protein
	complex structures, and those parameters that are the most effective
	in selecting {G}-protein type were used as feature vectors in the
	{SVM}. {T}he main part of {GRIFFIN} includes a hierarchical {SVM}
	classifier using the feature vectors, which is useful for {C}lass
	{A} {GPCR}s, the major family. {F}or the opsins and olfactory subfamilies
	of {C}lass {A} and other minor families ({C}lasses {B}, {C}, frizzled
	and smoothened), the binding {G}-protein is predicted with high accuracy
	using the {HMM}. {A}pplying this system to known {GPCR} sequences,
	each binding {G}-protein is predicted with high sensitivity and specificity
	(>85\% on average). {GRIFFIN} (http://griffin.cbrc.jp/) is freely
	available and allows users to easily execute this reliable prediction
	of {G}-proteins.},
  doi = {10.1093/nar/gki495},
  pdf = {../local/Yabuki2005GRIFFIN.pdf},
  file = {Yabuki2005GRIFFIN.pdf:local/Yabuki2005GRIFFIN.pdf:PDF},
  keywords = {biosvm},
  pii = {33/suppl_2/W148},
  url = {http://dx.doi.org/10.1093/nar/gki495}
}
@incollection{Yamanishi2004Heterogeneous,
  author = {Yamanishi, Y. and Vert, J.-P. and Kanehisa, M.},
  title = {Heterogeneous data comparison and gene selection with kernel canonical
	correlation analysis},
  booktitle = {Kernel {M}ethods in {C}omputational {B}iology},
  publisher = {MIT Press},
  address = {Cambridge, MA},
  year = {2004},
  editor = {Sch{\"o}lkopf, B. and Tsuda, K. and Vert, J.-P.},
  pages = {209--230},
  pdf = {../local/heterogeneous.pdf},
  file = {heterogeneous.pdf:http\://cg.ensmp.fr/~vert/publi/04kmcbbook/heterogeneous.pdf:PDF},
  keywords = {biosvm},
  owner = {vert},
  url = {http://cg.ensmp.fr/~vert/publi/04kmcbbook/heterogeneous.pdf}
}
@article{Yamanishi2005Supervised,
  author = {Yamanishi, Y. and Vert, J.-P. and Kanehisa, M.},
  title = {Supervised enzyme network inference from the integration of genomic
	data and chemical information},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21},
  pages = {i468--i477},
  number = {Suppl. 1},
  abstract = {Motivation: {T}he metabolic network is an important biological network
	which relates enzyme proteins and chemical compounds. {A} large number
	of metabolic pathways remain unknown nowadays, and many enzymes are
	missing even in known metabolic pathways. {T}here is, therefore,
	an incentive to develop methods to reconstruct the unknown parts
	of the metabolic network and to identify genes coding for missing
	enzymes. {R}esults: {T}his paper presents new methods to infer enzyme
	networks from the integration of multiple genomic data and chemical
	information, in the framework of supervised graph inference. {T}he
	originality of the methods is the introduction of chemical compatibility
	as a constraint for refining the network predicted by the network
	inference engine. {T}he chemical compatibility between two enzymes
	is obtained automatically from the information encoded by their {E}nzyme
	{C}ommission ({EC}) numbers. {T}he proposed methods are tested and
	compared on their ability to infer the enzyme network of the yeast
	{S}accharomyces cerevisiae from four datasets for enzymes with assigned
	{EC} numbers: gene expression data, protein localization data, phylogenetic
	profiles and chemical compatibility information. {I}t is shown that
	the prediction accuracy of the network reconstruction consistently
	improves owing to the introduction of chemical constraints, the use
	of a supervised approach and the weighted integration of multiple
	datasets. {F}inally, we conduct a comprehensive prediction of a global
	enzyme network consisting of all enzyme candidate proteins of the
	yeast to obtain new biological findings.},
  doi = {10.1093/bioinformatics/bti1012},
  pdf = {../local/Yamanishi2005Supervised.pdf},
  file = {Yamanishi2005Supervised.pdf:local/Yamanishi2005Supervised.pdf:PDF},
  keywords = {biosvm},
  url = {http://dx.doi.org/10.1093/bioinformatics/bti1012}
}
@article{Yamanishi2004Protein,
  author = {Yamanishi, Y. and Vert, J.-P. and Kanehisa, M.},
  title = {Protein network inference from multiple genomic data: a supervised
	approach},
  journal = {Bioinformatics},
  year = {2004},
  volume = {20},
  pages = {i363--i370},
  number = {Suppl. 1},
  abstract = {Motivation: {A}n increasing number of observations support the hypothesis
	that most biological functions involve the interactions between many
	proteins, and that the complexity of living systems arises as a result
	of such interactions. {I}n this context, the problem of inferring
	a global protein network for a given organism, using all available
	genomic data about the organism, is quickly becoming one of the main
	challenges in current computational biology. {R}esults: {T}his paper
	presents a new method to infer protein networks from multiple types
	of genomic data. {B}ased on a variant of kernel canonical correlation
	analysis, its originality is in the formalization of the protein
	network inference problem as a supervised learning problem, and in
	the integration of heterogeneous genomic data within this framework.
	{W}e present promising results on the prediction of the protein network
	for the yeast {S}accharomyces cerevisiae from four types of widely
	available data: gene expressions, protein interactions measured by
	yeast two-hybrid systems, protein localizations in the cell and protein
	phylogenetic profiles. {T}he method is shown to outperform other
	unsupervised protein network inference methods. {W}e finally conduct
	a comprehensive prediction of the protein network for all proteins
	of the yeast, which enables us to propose protein candidates for
	missing enzymes in a biosynthesis pathway. {A}vailability: {S}oftwares
	are available upon request.},
  pdf = {../local/Yamanishi2004Protein.pdf},
  file = {Yamanishi2004Protein.pdf:local/Yamanishi2004Protein.pdf:PDF},
  keywords = {biosvm},
  owner = {vert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/20/suppl_1/i363}
}
@article{Yamanishi2003Extraction,
  author = {Yamanishi, Y. and Vert, J.-P. and Nakaya, A. and Kanehisa, M.},
  title = {Extraction of correlated gene clusters from multiple genomic data
	by generalized kernel canonical correlation analysis},
  journal = {Bioinformatics},
  year = {2003},
  volume = {19},
  pages = {i323--i330},
  number = {Suppl. 1},
  abstract = {Motivation: {A} major issue in computational biology is the reconstruction
	of pathways from several genomic datasets, such as expression data,
	protein interaction data and phylogenetic profiles. {A}s a first
	step toward this goal, it is important to investigate the amount
	of correlation which exists between these data. {R}esults: {T}hese
	methods are successfully tested on their ability to recognize operons
	in the {E}scherichia coli genome, from the comparison of three datasets
	corresponding to functional relationships between genes in metabolic
	pathways, geometrical relationships along the chromosome, and co-expression
	relationships as observed by gene expression data. {C}ontact: yoshi@kuicr.kyoto-u.ac.jp},
  pdf = {../local/Yamanishi2003Extraction.pdf},
  file = {Yamanishi2003Extraction.pdf:local/Yamanishi2003Extraction.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/suppl_1/i323}
}
@article{Yan2004two-stage,
  author = {Yan, C. and Dobbs, D. and Honavar, V.},
  title = {A two-stage classifier for identification of protein-protein interface
	residues},
  journal = {Bioinformatics},
  year = {2004},
  volume = {20},
  pages = {i371--i378},
  number = {Suppl. 1},
  abstract = {Motivation: {T}he ability to identify protein-protein interaction
	sites and to detect specific amino acid residues that contribute
	to the specificity and affinity of protein interactions has important
	implications for problems ranging from rational drug design to analysis
	of metabolic and signal transduction networks. {R}esults: {W}e have
	developed a two-stage method consisting of a support vector machine
	({SVM}) and a {B}ayesian classifier for predicting surface residues
	of a protein that participate in protein-protein interactions. {T}his
	approach exploits the fact that interface residues tend to form clusters
	in the primary amino acid sequence. {O}ur results show that the proposed
	two-stage classifier outperforms previously published sequence-based
	methods for predicting interface residues. {W}e also present results
	obtained using the two-stage classifier on an independent test set
	of seven {CAPRI} ({C}ritical {A}ssessment of {PR}edicted {I}nteractions)
	targets. {T}he success of the predictions is validated by examining
	the predictions in the context of the three-dimensional structures
	of protein complexes. {S}upplementary information: http://www.public.iastate.edu/~chhyan/{ISMB}2004/list.html},
  pdf = {../local/Yan2004two-stage.pdf},
  file = {Yan2004two-stage.pdf:local/Yan2004two-stage.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/20/suppl_1/i371}
}
@article{Yan2004Identification,
  author = {Yan, C. and Honavar, V. and Dobbs, D.},
  title = {Identification of interface residues in protease-inhibitor and antigen-antibody
	complexes: a support vector machine approach},
  journal = {Neural {C}omput. \& {A}pplic.},
  year = {2004},
  volume = {13},
  pages = {123--129},
  doi = {10.1007/s00521-004-0414-3},
  pdf = {../local/Yan2004Identification.pdf},
  file = {Yan2004Identification.pdf:local/Yan2004Identification.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert}
}
@article{Yan2007Determining,
  author = {Yan, Mingjin and Ye, Keying},
  title = {Determining the number of clusters using the weighted gap statistic},
  journal = {Biometrics},
  year = {2007},
  volume = {63},
  pages = {1031--1037},
  number = {4},
  month = dec,
  abstract = {Estimating the number of clusters in a data set is a crucial step
	in cluster analysis. In this article, motivated by the gap method
	(Tibshirani, Walther, and Hastie, 2001, Journal of the Royal Statistical
	Society B63, 411-423), we propose the weighted gap and the difference
	of difference-weighted (DD-weighted) gap methods for estimating the
	number of clusters in data using the weighted within-clusters sum
	of errors: a measure of the within-clusters homogeneity. In addition,
	we propose a "multilayer" clustering approach, which is shown to
	be more accurate than the original gap method, particularly in detecting
	the nested cluster structure of the data. The methods are applicable
	when the input data contain continuous measurements and can be used
	with any clustering method. Simulation studies and real data are
	investigated and compared among these proposed methods as well as
	with the original gap method.},
  doi = {10.1111/j.1541-0420.2007.00784.x},
  institution = {Medtronic Sofamor Danek, 1800 Pyramid Place, Memphis, Tennessee 38132,
	USA. mingjin.yan@medtronic.com},
  keywords = {Algorithms; Biometry, methods; Cluster Analysis; Computer Simulation;
	Data Interpretation, Statistical; Models, Biological; Models, Statistical;
	Pattern Recognition, Automated, methods},
  language = {eng},
  medline-pst = {ppublish},
  owner = {jp},
  pii = {BIOM784},
  pmid = {17425640},
  timestamp = {2011.12.29},
  url = {http://dx.doi.org/10.1111/j.1541-0420.2007.00784.x}
}
@article{Yang2004Bio-support,
  author = {Yang, Z. R. and Chou, K.-C.},
  title = {Bio-support vector machines for computational proteomics},
  journal = {Bioinformatics},
  year = {2004},
  volume = {20},
  pages = {735--741},
  number = {5},
  abstract = {Motivation: {O}ne of the most important issues in computational proteomics
	is to produce a prediction model for the classification or annotation
	of biological function of novel protein sequences. {I}n order to
	improve the prediction accuracy, much attention has been paid to
	the improvement of the performance of the algorithms used, few is
	for solving the fundamental issue, namely, amino acid encoding as
	most existing pattern recognition algorithms are unable to recognize
	amino acids in protein sequences. {I}mportantly, the most commonly
	used amino acid encoding method has the flaw that leads to large
	computational cost and recognition bias. {R}esults: {B}y replacing
	kernel functions of support vector machines ({SVM}s) with amino acid
	similarity measurement matrices, we have modified {SVM}s, a new type
	of pattern recognition algorithm for analysing protein sequences,
	particularly for proteolytic cleavage site prediction. {W}e refer
	to the modified {SVM}s as bio-support vector machine. {W}hen applied
	to the prediction of {HIV} protease cleavage sites, the new method
	has shown a remarkable advantage in reducing the model complexity
	and enhancing the model robustness.},
  pdf = {../local/Yang2004Bio-support.pdf},
  file = {Yang2004Bio-support.pdf:local/Yang2004Bio-support.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/20/5/735}
}
@article{Yao2004Comparative,
  author = {X. J. Yao and A. Panaye and J. P. Doucet and R. S. Zhang and H. F.
	Chen and M. C. Liu and Z. D. Hu and B. T. Fan},
  title = {Comparative study of {QSAR}/{QSPR} correlations using support vector
	machines, radial basis function neural networks, and multiple linear
	regression},
  journal = {J {C}hem {I}nf {C}omput {S}ci},
  year = {2004},
  volume = {44},
  pages = {1257--1266},
  number = {4},
  abstract = {Support vector machines ({SVM}s) were used to develop {QSAR} models
	that correlate molecular structures to their toxicity and bioactivities.
	{T}he performance and predictive ability of {SVM} are investigated
	and compared with other methods such as multiple linear regression
	and radial basis function neural network methods. {I}n the present
	study, two different data sets were evaluated. {T}he first one involves
	an application of {SVM} to the development of a {QSAR} model for
	the prediction of toxicities of 153 phenols, and the second investigation
	deals with the {QSAR} model between the structures and the activities
	of a set of 85 cyclooxygenase 2 ({COX}-2) inhibitors. {F}or each
	application, the molecular structures were described using either
	the physicochemical parameters or molecular descriptors. {I}n both
	studied cases, the predictive ability of the {SVM} model is comparable
	or superior to those obtained by {MLR} and {RBFNN}. {T}he results
	indicate that {SVM} can be used as an alternative powerful modeling
	tool for {QSAR} studies.},
  doi = {10.1021/ci049965i},
  pdf = {../local/Yao2004Comparative.pdf},
  file = {Yao2004Comparative.pdf:local/Yao2004Comparative.pdf:PDF},
  keywords = {biosvm chemoinformatics},
  url = {http://dx.doi.org/10.1021/ci049965i}
}
@article{Yap2004Prediction,
  author = {C. W. Yap and C. Z. Cai and Y. Xue and Y. Z. Chen},
  title = {Prediction of torsade-causing potential of drugs by support vector
	machine approach},
  journal = {Toxicol {S}ci},
  year = {2004},
  volume = {79},
  pages = {170--177},
  number = {1},
  month = may,
  abstract = {In an effort to facilitate drug discovery, computational methods for
	facilitating the prediction of various adverse drug reactions ({ADR}s)
	have been developed. {S}o far, attention has not been sufficiently
	paid to the development of methods for the prediction of serious
	{ADR}s that occur less frequently. {S}ome of these {ADR}s, such as
	torsade de pointes ({T}d{P}), are important issues in the approval
	of drugs for certain diseases. {T}hus there is a need to develop
	tools for facilitating the prediction of these {ADR}s. {T}his work
	explores the use of a statistical learning method, support vector
	machine ({SVM}), for {T}d{P} prediction. {T}d{P} involves multiple
	mechanisms and {SVM} is a method suitable for such a problem. {O}ur
	{SVM} classification system used a set of linear solvation energy
	relationship ({LSER}) descriptors and was optimized by leave-one-out
	cross validation procedure. {I}ts prediction accuracy was evaluated
	by using an independent set of agents and by comparison with results
	obtained from other commonly used classification methods using the
	same dataset and optimization procedure. {T}he accuracies for the
	{SVM} prediction of {T}d{P}-causing agents and non-{T}d{P}-causing
	agents are 97.4 and 84.6\% respectively; one is substantially improved
	against and the other is comparable to the results obtained by other
	classification methods useful for multiple-mechanism prediction problems.
	{T}his indicates the potential of {SVM} in facilitating the prediction
	of {T}d{P}-causing risk of small molecules and perhaps other {ADR}s
	that involve multiple mechanisms.},
  doi = {10.1093/toxsci/kfh082},
  pdf = {../local/Yap2004Prediction.pdf},
  file = {Yap2004Prediction.pdf:local/Yap2004Prediction.pdf:PDF},
  keywords = {biosvm chemoinformatics},
  pii = {kfh082},
  url = {http://dx.doi.org/10.1093/toxsci/kfh082}
}
@article{Yap2005Prediction,
  author = {C. W. Yap and Y. Z. Chen},
  title = {Prediction of Cytochrome {P450} {3A4}, {2D6}, and {2C9} Inhibitors
	and Substrates by Using Support Vector Machines},
  journal = {J {C}hem {I}nf {M}odel},
  year = {2005},
  volume = {45},
  pages = {982--992},
  number = {4},
  abstract = {Statistical learning methods have been used in developing filters
	for predicting inhibitors of two {P}450 isoenzymes, {CYP}3{A}4 and
	{CYP}2{D}6. {T}his work explores the use of different statistical
	learning methods for predicting inhibitors of these enzymes and an
	additional {P}450 enzyme, {CYP}2{C}9, and the substrates of the three
	{P}450 isoenzymes. {T}wo consensus support vector machine ({CSVM})
	methods, "positive majority" ({PM}-{CSVM}) and "positive probability"
	({PP}-{CSVM}), were used in this work. {T}hese methods were first
	tested for the prediction of inhibitors of {CYP}3{A}4 and {CYP}2{D}6
	by using a significantly higher number of inhibitors and noninhibitors
	than that used in earlier studies. {T}hey were then applied to the
	prediction of inhibitors of {CYP}2{C}9 and substrates of the three
	enzymes. {B}oth methods predict inhibitors of {CYP}3{A}4 and {CYP}2{D}6
	at a similar level of accuracy as those of earlier studies. {F}or
	classification of inhibitors of {CYP}2{C}9, the best {CSVM} method
	gives an accuracy of 88.9\% for inhibitors and 96.3\% for noninhibitors.
	{T}he accuracies for classification of substrates and nonsubstrates
	of {CYP}3{A}4, {CYP}2{D}6, and {CYP}2{C}9 are 98.2 and 90.9\%, 96.6
	and 94.4\%, and 85.7 and 98.8\%, respectively. {B}oth {CSVM} methods
	are potentially useful as filters for predicting inhibitors and substrates
	of {P}450 isoenzymes. {T}hese methods generally give better accuracies
	than single {SVM} classification systems, and the performance of
	the {PP}-{CSVM} method is slightly better than that of the {PM}-{CSVM}
	method.},
  doi = {10.1021/ci0500536},
  pdf = {../local/Yap2005Prediction.pdf},
  file = {Yap2005Prediction.pdf:local/Yap2005Prediction.pdf:PDF},
  keywords = {biosvm chemoinformatics},
  url = {http://dx.doi.org/10.1021/ci0500536}
}
@article{Yeang2001Molecular,
  author = {Yeang, C. H. and Ramaswamy, S. and Tamayo, P. and Mukherjee, S. and
	Rifkin, R. M. and Angelo, M. and Reich, M. and Lander, E. and Mesirov,
	J. and Golub, T.},
  title = {Molecular classification of multiple tumor types},
  journal = {Bioinformatics},
  year = {2001},
  volume = {17},
  pages = {S316--S322},
  number = {Suppl. 1},
  abstract = {Using gene expression data to classify tumor types is a very promising
	tool in cancer diagnosis. {P}revious works show several pairs of
	tumor types can be successfully distinguished by their gene expression
	patterns ({G}olub et al. 1999, {B}en-{D}or et al. 2000, {A}lizadeh
	et al. 2000). {H}owever, the simultaneous classification across a
	heterogeneous set of tumor types has not been well studied yet. {W}e
	obtained 190 samples from 14 tumor classes and generated a combined
	expression dataset containing 16063 genes for each of those samples.
	{W}e performed multi-class classification by combining the outputs
	of binary classifiers. {T}hree binary classifiers (k-nearest neighbors,
	weighted voting, and support vector machines) were applied in conjunction
	with three combination scenarios (one-vs-all, all-pairs, hierarchical
	partitioning). {W}e achieved the best cross validation error rate
	of 18.75\% and the best test error rate of 21.74\% by using the one-vs-all
	support vector machine algorithm. {T}he results demonstrate the feasibility
	of performing clinically useful classification from samples of multiple
	tumor types.},
  pdf = {../local/Yeang2001Molecular.pdf},
  file = {Yeang2001Molecular.pdf:local/Yeang2001Molecular.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/17/suppl_1/S316}
}
@article{Yiu2005Filtering,
  author = {Yiu, S. M. and Wong, Prudence W. H. and Lam, T. W. and Mui, Y. C. and
	Kung, H. F. and Lin, Marie and Cheung, Y. T.},
  title = {Filtering of Ineffective {siRNAs} and Improved {siRNA} Design
	Tool},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21},
  pages = {144--151},
  number = {2},
  month = jan,
  abstract = {Motivation: {S}hort interfering {RNA}s (si{RNA}s) can be used to suppress
	gene expression and possess many potential applications in therapy,
	but how to design an effective si{RNA} is still not clear. {B}ased
	on the {MPI} ({M}ax-{P}lanck-{I}nstitute) basic principles, a number
	of si{RNA} design tools have been developed recently. {T}he set of
	candidates reported by these tools is usually large and often contains
	ineffective si{RNA}s. {I}n view of this, we initiate the study of
	filtering ineffective si{RNA}s. {R}esults: {T}he contribution of
	this paper is 2-fold. {F}irst, we propose a fair scheme to compare
	existing design tools based on real data in the literature. {S}econd,
	we attempt to improve the {MPI} principles and existing tools by
	an algorithm that can filter ineffective si{RNA}s. {T}he algorithm
	is based on some new observations on the secondary structure, which
	we have verified by {AI} techniques (decision trees and support vector
	machines). {W}e have tested our algorithm together with the {MPI}
	principles and the existing tools. {T}he results show that our filtering
	algorithm is effective. {A}vailability: {T}he si{RNA} design software
	tool can be found in the website http://www.cs.hku.hk/~sirna/ {C}ontact:
	smyiu@cs.hku.hk},
  doi = {10.1093/bioinformatics/bth498},
  pdf = {../local/Yiu2005Filtering.pdf},
  file = {Yiu2005Filtering.pdf:local/Yiu2005Filtering.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/21/2/144}
}
@article{Yoon2003Analysis,
  author = {Yoon, Y. and Song, J. and Hong, S. H. and Kim, J. Q.},
  title = {Analysis of multiple single nucleotide polymorphisms of candidate
	genes related to coronary heart disease susceptibility by using support
	vector machines},
  journal = {Clin. {C}hem. {L}ab. {M}ed.},
  year = {2003},
  volume = {41},
  pages = {529--534},
  number = {4},
  abstract = {Coronary heart disease ({CHD}) is a complex genetic disease involving
	gene-environment interaction. {M}any association studies between
	single nucleotide polymorphisms ({SNP}s) of candidate genes and {CHD}
	have been reported. {W}e have applied a new method to analyze such
	relationships using support vector machines ({SVM}s), which is one
	of the methods for artificial neuronal network. {W}e assumed that
	common haplotype implicit in genotypes will differ between cases
	and controls, and that this will allow {SVM}-derived patterns to
	be classifiable according to subject genotypes. {F}ourteen {SNP}s
	of ten candidate genes in 86 {CHD} patients and 119 controls were
	investigated. {G}enotypes were transformed to a numerical vector
	by giving scores based on difference between the genotypes of each
	subject and the reference genotypes, which represent the healthy
	normal population. {O}verall classification accuracy by {SVM}s was
	64.4\% with a receiver operating characteristic ({ROC}) area of 0.639.
	{B}y conventional analysis using the chi2 test, the association between
	{CHD} and the {SNP} of the scavenger receptor {B}1 gene was most
	significant in terms of allele frequencies in cases vs. controls
	(p = 0.0001). {I}n conclusion, we suggest that the application of
	{SVM}s for association studies of {SNP}s in candidate genes shows
	considerable promise and that further work could be usefully performed
	upon the estimation of {CHD} susceptibility in individuals of high
	risk.},
  pdf = {../local/Yoon2003Analysis.pdf},
  file = {Yoon2003Analysis.pdf:local/Yoon2003Analysis.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://www.degruyter.de/journals/cclm/abs/10592.html}
}
@article{Yu2003Fine-grained,
  author = {Yu, C. S. and Wang, J. Y. and Yang, J. M. and Lyu, P. C. and Lin, C. J.
	and Hwang, J. K.},
  title = {Fine-grained protein fold assignment by support vector machines using
	generalized npeptide coding schemes and jury voting from multiple-parameter
	sets},
  journal = {Proteins},
  year = {2003},
  volume = {50},
  pages = {531},
  number = {4},
  month = jun,
  abstract = {In the coarse-grained fold assignment of major protein classes, such
	as all-alpha, all-beta, alpha + beta, alpha/beta proteins, one can
	easily achieve high prediction accuracy from primary amino acid sequences.
	{H}owever, the fine-grained assignment of folds, such as those defined
	in the {S}tructural {C}lassification of {P}roteins ({SCOP}) database,
	presents a challenge due to the larger amount of folds available.
	{R}ecent study yielded reasonable prediction accuracy of 56.0\% on
	an independent set of 27 most populated folds. {I}n this communication,
	we apply the support vector machine ({SVM}) method, using a combination
	of protein descriptors based on the properties derived from the composition
	of n-peptide and jury voting, to the fine-grained fold prediction,
	and are able to achieve an overall prediction accuracy of 69.6\% on
	the same independent set-significantly higher than the previous results.
	{O}n 10-fold cross-validation, we obtained a prediction accuracy
	of 65.3\%. {O}ur results show that {SVM} coupled with suitable global
	sequence-coding schemes can significantly improve the fine-grained
	fold prediction. {O}ur approach should be useful in structure prediction
	and modeling.},
  doi = {10.1002/prot.10313},
  pdf = {../local/Yu2003Fine-grained.pdf},
  file = {Yu2003Fine-grained.pdf:local/Yu2003Fine-grained.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1002/prot.10313}
}
@article{Yu2005Classifying,
  author = {Yu, C. and Zavaljevski, N. and Stevens, F. J. and Yackovich, K. and
	Reifman, J.},
  title = {Classifying noisy protein sequence data: a case study of immunoglobulin
	light chains},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21},
  pages = {i495--i501},
  number = {Suppl. 1},
  month = jun,
  abstract = {S{UMMARY}: {T}he classification of protein sequences obtained from
	patients with various immunoglobulin-related conformational diseases
	may provide insight into structural correlates of pathogenicity.
	{H}owever, clinical data are very sparse and, in the case of antibody-related
	proteins, the collected sequences have large variability with only
	a small subset of variations relevant to the protein pathogenicity
	(function). {O}n this basis, these sequences represent a model system
	for development of strategies to recognize the small subset of function-determining
	variations among the much larger number of primary structure diversifications
	introduced during evolution. {U}nder such conditions, most protein
	classification algorithms have limited accuracy. {T}o address this
	problem, we propose a support vector machine ({SVM})-based classifier
	that combines sequence and 3{D} structural averaging information.
	{E}ach amino acid in the sequence is represented by a set of six
	physicochemical properties: hydrophobicity, hydrophilicity, volume,
	surface area, bulkiness and refractivity. {E}ach position in the
	sequence is described by the properties of the amino acid at that
	position and the properties of its neighbors in 3{D} space or in
	the sequence. {A} structure template is selected to determine neighbors
	in 3{D} space and a window size is used to determine the neighbors
	in the sequence. {T}he test data consist of 209 proteins of human
	antibody immunoglobulin light chains, each represented by aligned
	sequences of 120 amino acids. {T}he methodology is applied to the
	classification of protein sequences collected from patients with
	and without amyloidosis, and indicates that the proposed modified
	classifiers are more robust to sequence variability than standard
	{SVM} classifiers, improving classification error between 5 and 25\%
	and sensitivity between 9 and 17\%. {T}he classification results
	might also suggest possible mechanisms for the propensity of immunoglobulin
	light chains to amyloid formation. {CONTACT}: cyu@bioanalysis.org.},
  doi = {10.1093/bioinformatics/bti1024},
  pdf = {../local/Yu2005Classifying.pdf},
  file = {Yu2005Classifying.pdf:local/Yu2005Classifying.pdf:PDF},
  keywords = {biosvm},
  pii = {21/suppl_1/i495},
  url = {http://dx.doi.org/10.1093/bioinformatics/bti1024}
}
@article{Yu2004Predicting,
  author = {Yu, C.-S. and Lin, C.-J. and Hwang, J.-K.},
  title = {Predicting subcellular localization of proteins for {Gram}-negative
	bacteria by support vector machines based on n-peptide compositions},
  journal = {Protein {S}ci.},
  year = {2004},
  volume = {13},
  pages = {1402--1406},
  number = {5},
  abstract = {Gram-negative bacteria have five major subcellular localization sites:
	the cytoplasm, the periplasm, the inner membrane, the outer membrane,
	and the extracellular space. {T}he subcellular location of a protein
	can provide valuable information about its function. {W}ith the rapid
	increase of sequenced genomic data, the need for an automated and
	accurate tool to predict subcellular localization becomes increasingly
	important. {W}e present an approach to predict subcellular localization
	for {G}ram-negative bacteria. {T}his method uses the support vector
	machines trained by multiple feature vectors based on n-peptide compositions.
	{F}or a standard data set comprising 1443 proteins, the overall prediction
	accuracy reaches 89\%, which, to the best of our knowledge, is the
	highest prediction rate ever reported. {O}ur prediction is 14\% higher
	than that of the recently developed multimodular {PSORT}-{B}. {B}ecause
	of its simplicity, this approach can be easily extended to other
	organisms and should be a useful tool for the high-throughput and
	large-scale analysis of proteomic and genomic data.},
  doi = {10.1110/ps.03479604},
  pdf = {../local/Yu2004Predicting.pdf},
  file = {Yu2004Predicting.pdf:local/Yu2004Predicting.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://www.proteinscience.org/cgi/content/abstract/13/5/1402}
}
@article{Yu2004integrated,
  author = {Yu, J. K. and Chen, Y. D. and Zheng, S.},
  title = {An integrated approach to the detection of colorectal cancer utilizing
	proteomics and bioinformatics},
  journal = {World {J}. {G}astroenterol.},
  year = {2004},
  volume = {10},
  pages = {3127--3131},
  number = {21},
  abstract = {A{IM}: {T}o find new potential biomarkers and to establish patterns
	for early detection of colorectal cancer. {METHODS}: {O}ne hundred
	and eighty-two serum samples including 55 from colorectal cancer
	({CRC}) patients, 35 from colorectal adenoma ({CRA}) patients and
	92 from healthy persons ({HP}) were detected by surface-enhanced
	laser desorption/ionization mass spectrometry ({SELDI}-{MS}). {T}he
	data of spectra were analyzed by bioinformatics tools like artificial
	neural network ({ANN}) and support vector machine ({SVM}). {RESULTS}:
	{T}he diagnostic pattern combined with 7 potential biomarkers could
	differentiate {CRC} patients from {CRA} patients with a specificity
	of 83\%, sensitivity of 89\% and positive predictive value of 89\%.
	{T}he diagnostic pattern combined with 4 potential biomarkers could
	differentiate {CRC} patients from {HP} with a specificity of 92\%,
	sensitivity of 89\% and positive predictive value of 86\%. {CONCLUSION}:
	{T}he combination of {SELDI} with bioinformatics tools could help
	find new biomarkers and establish patterns with high sensitivity
	and specificity for the detection of {CRC}.},
  pdf = {../local/Yu2004integrated.pdf},
  file = {Yu2004integrated.pdf:local/Yu2004integrated.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert}
}
@article{Yu2005Ovarian,
  author = {J. S. Yu and S. Ongarello and R. Fiedler and X. W. Chen and G. Toffolo
	and C. Cobelli and Z. Trajanoski},
  title = {Ovarian cancer identification based on dimensionality reduction for
	high-throughput mass spectrometry data},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21},
  pages = {2200--2209},
  number = {10},
  month = may,
  abstract = {M{OTIVATION}: {H}igh-throughput and high-resolution mass spectrometry
	instruments are increasingly used for disease classification and
	therapeutic guidance. {H}owever, the analysis of immense amount of
	data poses considerable challenges. {W}e have therefore developed
	a novel method for dimensionality reduction and tested on a published
	ovarian high-resolution {SELDI}-{TOF} dataset. {RESULTS}: {W}e have
	developed a four-step strategy for data preprocessing based on: (1)
	binning, (2) {K}olmogorov-{S}mirnov test, (3) restriction of coefficient
	of variation and (4) wavelet analysis. {S}ubsequently, support vector
	machines were used for classification. {T}he developed method achieves
	an average sensitivity of 97.38\% (sd = 0.0125) and an average specificity
	of 93.30\% (sd = 0.0174) in 1000 independent k-fold cross-validations,
	where k = 2, ..., 10. {AVAILABILITY}: {T}he software is available
	for academic and non-commercial institutions.},
  doi = {10.1093/bioinformatics/bti370},
  pdf = {../local/Yu2005Ovarian.pdf},
  file = {Yu2005Ovarian.pdf:local/Yu2005Ovarian.pdf:PDF},
  keywords = {biosvm proteomics},
  pii = {bti370},
  url = {http://dx.doi.org/10.1093/bioinformatics/bti370}
}
@article{Yu2005integrated,
  author = {Yu, J.-k. and Zheng, S. and Tang, Y. and Li, L.},
  title = {An integrated approach utilizing proteomics and bioinformatics to
	detect ovarian cancer},
  journal = {J {Z}hejiang {U}niv {S}ci {B}},
  year = {2005},
  volume = {6},
  pages = {227--231},
  number = {4},
  month = apr,
  abstract = {O{BJECTIVE}: {T}o find new potential biomarkers and establish the
	patterns for the detection of ovarian cancer. {METHODS}: {S}ixty
	one serum samples including 32 ovarian cancer patients and 29 healthy
	people were detected by surface-enhanced laser desorption/ionization
	mass spectrometry ({SELDI}-{MS}). {T}he protein fingerprint data
	were analyzed by bioinformatics tools. {T}en folds cross-validation
	support vector machine ({SVM}) was used to establish the diagnostic
	pattern. {RESULTS}: {F}ive potential biomarkers were found (2085
	{D}a, 5881 {D}a, 7564 {D}a, 9422 {D}a, 6044 {D}a), combined with
	which the diagnostic pattern separated the ovarian cancer from the
	healthy samples with a sensitivity of 96.7\%, a specificity of 96.7\%
	and a positive predictive value of 96.7\%. {CONCLUSIONS}: {T}he combination
	of {SELDI} with bioinformatics tools could find new biomarkers and
	establish patterns with high sensitivity and specificity for the
	detection of ovarian cancer.},
  doi = {10.1631/jzus.2005.B0227},
  pdf = {../local/Yu2005integrated.pdf},
  file = {Yu2005integrated.pdf:local/Yu2005integrated.pdf:PDF},
  keywords = {biosvm},
  url = {http://dx.doi.org/10.1631/jzus.2005.B0227}
}
@article{Yu2002Methods,
  author = {Kun Yu and Nikolai Petrovsky and Christian Sch{\"o}nbach and Judice
	Y L Koh and Vladimir Brusic},
  title = {Methods for prediction of peptide binding to {MHC} molecules: a comparative
	study},
  journal = {Mol Med},
  year = {2002},
  volume = {8},
  pages = {137--148},
  number = {3},
  month = mar,
  abstract = {BACKGROUND: A variety of methods for prediction of peptide binding
	to major histocompatibility complex (MHC) have been proposed. These
	methods are based on binding motifs, binding matrices, hidden Markov
	models (HMM), or artificial neural networks (ANN). There has been
	little prior work on the comparative analysis of these methods. MATERIALS
	AND METHODS: We performed a comparison of the performance of six
	methods applied to the prediction of two human MHC class I molecules,
	including binding matrices and motifs, ANNs, and HMMs. RESULTS: The
	selection of the optimal prediction method depends on the amount
	of available data (the number of peptides of known binding affinity
	to the MHC molecule of interest), the biases in the data set and
	the intended purpose of the prediction (screening of a single protein
	versus mass screening). When little or no peptide data are available,
	binding motifs are the most useful alternative to random guessing
	or use of a complete overlapping set of peptides for selection of
	candidate binders. As the number of known peptide binders increases,
	binding matrices and HMM become more useful predictors. ANN and HMM
	are the predictive methods of choice for MHC alleles with more than
	100 known binding peptides. CONCLUSION: The ability of bioinformatic
	methods to reliably predict MHC binding peptides, and thereby potential
	T-cell epitopes, has major implications for clinical immunology,
	particularly in the area of vaccine design.},
  keywords = {Amino Acid Motifs; Computational Biology; Histocompatibility Antigens
	Class I; Humans; Models, Molecular; Peptides; Protein Binding},
  owner = {laurent},
  pii = {S152836580230137X},
  pmid = {12142545},
  timestamp = {2007.01.27}
}
@article{Yuan2002Prediction,
  author = {Yuan, Z. and Burrage, K. and Mattick, J. S.},
  title = {Prediction of protein solvent accessibility using support vector
	machines},
  journal = {Proteins},
  year = {2002},
  volume = {48},
  pages = {566--570},
  number = {3},
  abstract = {A {S}upport {V}ector {M}achine learning system has been trained to
	predict protein solvent accessibility from the primary structure.
	{D}ifferent kernel functions and sliding window sizes have been explored
	to find how they affect the prediction performance. {U}sing a cut-off
	threshold of 15\% that splits the dataset evenly (an equal number
	of exposed and buried residues), this method was able to achieve
	a prediction accuracy of 70.1\% for single sequence input and 73.9\%
	for multiple alignment sequence input, respectively. {T}he prediction
	of three and more states of solvent accessibility was also studied
	and compared with other methods. {T}he prediction accuracies are
	better than, or comparable to, those obtained by other methods such
	as neural networks, {B}ayesian classification, multiple linear regression,
	and information theory. {I}n addition, our results further suggest
	that this system may be combined with other prediction methods to
	achieve more reliable results, and that the {S}upport {V}ector {M}achine
	method is a very useful tool for biological sequence analysis.},
  doi = {10.1002/prot.10176},
  pdf = {../local/Yuan2002Prediction.pdf},
  file = {Yuan2002Prediction.pdf:local/Yuan2002Prediction.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1002/prot.10176}
}
@article{Yuan2004SVMtm,
  author = {Yuan, Z. and Mattick, J. S. and Teasdale, R. D.},
  title = {{{SVM}tm}: support vector machines to predict transmembrane segments},
  journal = {J. {C}omput. {C}hem.},
  year = {2004},
  volume = {25},
  pages = {632},
  number = {5},
  month = jun,
  abstract = {A new method has been developed for prediction of transmembrane helices
	using support vector machines. {D}ifferent coding schemes of protein
	sequences were explored, and their performances were assessed by
	crossvalidation tests. {T}he best performance method can predict
	the transmembrane helices with sensitivity of 93.4\% and precision
	of 92.0\%. {F}or each predicted transmembrane segment, a score is
	given to show the strength of transmembrane signal and the prediction
	reliability. {I}n particular, this method can distinguish transmembrane
	proteins from soluble proteins with an accuracy of approximately
	99\%. {T}his method can be used to complement current transmembrane
	helix prediction methods and can be used for consensus analysis of
	entire proteomes. {T}he predictor is located at http://genet.imb.uq.edu.au/predictors/{SVM}tm.},
  doi = {10.1002/jcc.10411},
  pdf = {../local/Yuan2004SVMtm.pdf},
  file = {Yuan2004SVMtm.pdf:local/Yuan2004SVMtm.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1002/jcc.10411}
}
@article{Zaki2005Application,
  author = {Zaki, N. M. and Deris, S. and Illias, R.},
  title = {Application of string kernels in protein sequence classification},
  journal = {Appl. {B}ioinformatics},
  year = {2005},
  volume = {4},
  pages = {45--52},
  number = {1},
  abstract = {I{NTRODUCTION}: {T}he production of biological information has become
	much greater than its consumption. {T}he key issue now is how to
	organise and manage the huge amount of novel information to facilitate
	access to this useful and important biological information. {O}ne
	core problem in classifying biological information is the annotation
	of new protein sequences with structural and functional features.
	{METHOD}: {T}his article introduces the application of string kernels
	in classifying protein sequences into homogeneous families. {A} string
	kernel approach used in conjunction with support vector machines
	has been shown to achieve good performance in text categorisation
	tasks. {W}e evaluated and analysed the performance of this approach,
	and we present experimental results on three selected families from
	the {SCOP} ({S}tructural {C}lassification of {P}roteins) database.
	{W}e then compared the overall performance of this method with the
	existing protein classification methods on benchmark {SCOP} datasets.
	{RESULTS}: {A}ccording to the {F}1 performance measure and the rate
	of false positive ({RFP}) measure, the string kernel method performs
	well in classifying protein sequences. {T}he method outperformed
	all the generative-based methods and is comparable with the {SVM}-{F}isher
	method. {DISCUSSION}: {A}lthough the string kernel approach makes
	no use of prior biological knowledge, it still captures sufficient
	biological information to enable it to outperform some of the state-of-the-art
	methods.},
  keywords = {biosvm},
  pii = {415}
}
@article{Zavaljevski2002Support,
  author = {Zavaljevski, N. and Stevens, F. J. and Reifman, J.},
  title = {Support vector machines with selective kernel scaling for protein
	classification and identification of key amino acid positions},
  journal = {Bioinformatics},
  year = {2002},
  volume = {18},
  pages = {689--696},
  number = {5},
  abstract = {Motivation: {D}ata that characterize primary and tertiary structures
	of proteins are now accumulating at a rapid and accelerating rate
	and require automated computational tools to extract critical information
	relating amino acid changes with the spectrum of functionally attributes
	exhibited by a protein. {W}e propose that immunoglobulin-type beta-domains,
	which are found in approximate 400 functionally distinct forms in
	humans alone, provide the immense genetic variation within limited
	conformational changes that might facilitate the development of new
	computational tools. {A}s an initial step, we describe here an approach
	based on {S}upport {V}ector {M}achine ({SVM}) technology to identify
	amino acid variations that contribute to the functional attribute
	of pathological self-assembly by some human antibody light chains
	produced during plasma cell diseases. {R}esults: {W}e demonstrate
	that {SVM}s with selective kernel scaling are an effective tool in
	discriminating between benign and pathologic human immunoglobulin
	light chains. {I}nitial results compare favorably against manual
	classification performed by experts and indicate the capability of
	{SVM}s to capture the underlying structure of the data. {T}he data
	set consists of 70 proteins of human antibody 1 light chains, each
	represented by aligned sequences of 120 amino acids. {W}e perform
	feature selection based on a first-order adaptive scaling algorithm,
	which confirms the importance of changes in certain amino acid positions
	and identifies other positions that are key in the characterization
	of protein function.},
  pdf = {../local/zava02.pdf},
  file = {zava02.pdf:local/zava02.pdf:PDF},
  keywords = {biosvm},
  subject = {biokernel},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/18/5/689}
}
@article{Zernov2003Drug,
  author = {V. V. Zernov and K. V. Balakin and A. A. Ivaschenko and N. P. Savchuk
	and I. V. Pletnev},
  title = {Drug discovery using support vector machines. {The} case studies
	of drug-likeness, agrochemical-likeness, and enzyme inhibition predictions},
  journal = {J {C}hem {I}nf {C}omput {S}ci},
  year = {2003},
  volume = {43},
  pages = {2048--2056},
  number = {6},
  abstract = {Support {V}ector {M}achines ({SVM}) is a powerful classification and
	regression tool that is becoming increasingly popular in various
	machine learning applications. {W}e tested the ability of {SVM},
	in comparison with well-known neural network techniques, to predict
	drug-likeness and agrochemical-likeness for large compound collections.
	{F}or both kinds of data, {SVM} outperforms various neural networks
	using the same set of descriptors. {W}e also used {SVM} for estimating
	the activity of {C}arbonic {A}nhydrase {II} ({CA} {II}) enzyme inhibitors
	and found that the prediction quality of our {SVM} model is better
	than that reported earlier for conventional {QSAR}. {M}odel characteristics
	and data set features were studied in detail.},
  doi = {10.1021/ci0340916},
  pdf = {../local/Zernov2003Drug.pdf},
  file = {Zernov2003Drug.pdf:local/Zernov2003Drug.pdf:PDF},
  keywords = {biosvm chemoinformatics},
  url = {http://dx.doi.org/10.1021/ci0340916}
}
@article{Zhang2005MULTIPRED,
  author = {Zhang, G. L. and Khan, A. M. and Srinivasan, K. N. and August, J.
	T. and Brusic, V.},
  title = {{MULTIPRED}: a computational system for prediction of promiscuous
	{HLA} binding peptides},
  journal = {Nucleic Acids Res.},
  year = {2005},
  volume = {33},
  pages = {W172--W179},
  number = {Web Server issue},
  month = jul,
  abstract = {MULTIPRED is a web-based computational system for the prediction of
	peptide binding to multiple molecules (proteins) belonging to human
	leukocyte antigens (HLA) class I A2, A3 and class II DR supertypes.
	It uses hidden Markov models and artificial neural network methods
	as predictive engines. A novel data representation method enables
	MULTIPRED to predict peptides that promiscuously bind multiple HLA
	alleles within one HLA supertype. Extensive testing was performed
	for validation of the prediction models. Testing results show that
	MULTIPRED is both sensitive and specific and it has good predictive
	ability (area under the receiver operating characteristic curve A(ROC)
	> 0.80). MULTIPRED can be used for the mapping of promiscuous T-cell
	epitopes as well as the regions of high concentration of these targets--termed
	T-cell epitope hotspots. MULTIPRED is available at http://antigen.i2r.a-star.edu.sg/multipred/.},
  doi = {10.1093/nar/gki452},
  keywords = {Algorithms, Amino Acid Sequence, Antigen-Antibody Complex, Automated,
	Binding Sites, Computational Biology, Drug Delivery Systems, Drug
	Design, Epitopes, HLA Antigens, HLA-A Antigens, HLA-DR Antigens,
	Humans, Internet, Markov Chains, Molecular Sequence Data, Neural
	Networks (Computer), Pattern Recognition, Peptides, Protein, Protein
	Binding, Protein Interaction Mapping, Sequence Analysis, Software,
	T-Lymphocyte, User-Computer Interface, Viral Vaccines, 15980449},
  pii = {33/suppl_2/W172},
  pmid = {15980449},
  timestamp = {2007.01.25},
  url = {http://dx.doi.org/10.1093/nar/gki452}
}
@article{Zhang2005Improved,
  author = {Qidong Zhang and Sukjoon Yoon and William J Welsh},
  title = {Improved method for predicting beta-turn using support vector machine},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21},
  pages = {2370--2374},
  number = {10},
  month = may,
  abstract = {M{OTIVATION}: {N}umerous methods for predicting beta-turns in proteins
	have been developed based on various computational schemes. {H}ere,
	we introduce a new method of beta-turn prediction that uses the support
	vector machine ({SVM}) algorithm together with predicted secondary
	structure information. {V}arious parameters from the {SVM} have been
	adjusted to achieve optimal prediction performance. {RESULTS}: {T}he
	{SVM} method achieved excellent performance as measured by the {M}atthews
	correlation coefficient ({MCC} = 0.45) using a 7-fold cross validation
	on a database of 426 non-homologous protein chains. {T}o our best
	knowledge, this {MCC} value is the highest achieved so far for predicting
	beta-turn. {T}he overall prediction accuracy {Q}total was 77.3\%,
	which is the best among the existing prediction methods. {A}mong
	its unique attractive features, the present {SVM} method avoids overtraining
	and compresses information and provides a predicted reliability index.},
  doi = {10.1093/bioinformatics/bti358},
  pdf = {../local/Zhang2005Improved.pdf},
  file = {Zhang2005Improved.pdf:local/Zhang2005Improved.pdf:PDF},
  keywords = {biosvm},
  pii = {bti358},
  url = {http://dx.doi.org/10.1093/bioinformatics/bti358}
}
@article{Zhang2003Classification,
  author = {Zhang, S.-W. and Pan, Q. and Zhang, H.-C. and Zhang, Y-L. and Wang,
	H.-Y.},
  title = {Classification of protein quaternary structure with support vector
	machine},
  journal = {Bioinformatics},
  year = {2003},
  volume = {19},
  pages = {2390--2396},
  number = {18},
  abstract = {Motivation: {S}ince the gap between sharply increasing known sequences
	and slow accumulation of known structures is becoming large, an automatic
	classification process based on the primary sequences and known three-dimensional
	structure becomes indispensable. {T}he classification of protein
	quaternary structure based on the primary sequences can provide some
	useful information for the biologists. {S}o a fully automatic and
	reliable classification system is needed. {T}his work tries to look
	for the effective methods of extracting attribute and the algorithm
	for classifying the quaternary structure from the primary sequences.
	{R}esults: {B}oth of the support vector machine ({SVM}) and the covariant
	discriminant algorithms have been first introduced to predict quaternary
	structure properties from the protein primary sequences. {T}he amino
	acid composition and the auto-correlation functions based on the
	amino acid index profile of the primary sequence have been taken
	into account in the algorithms. {W}e have analyzed 472 amino acid
	indices and selected the four amino acid indices as the examples,
	which have the best performance. {T}hus the five attribute parameter
	data sets ({COMP}, {FASG}, {NISK}, {WOLS} and {KYTJ}) were established
	from the protein primary sequences. {T}he {COMP} attribute data set
	is composed of amino acid composition, and the {FASG}, {NISK}, {WOLS}
	and {KYTJ} attribute data sets are composed of the amino acid composition
	and the auto-correlation functions of the corresponding amino acid
	residue index. {T}he overall accuracies of {SVM} are 78.5, 87.5,
	83.2, 81.7 and 81.9\%, respectively, for {COMP}, {FASG}, {NISK}, {WOLS}
	and {KYTJ} data sets in jackknife test, which are 19.6, 7.8, 15.5,
	13.1 and 15.8\%, respectively, higher than that of the covariant discriminant
	algorithm in the same test. {T}he results show that {SVM} may be
	applied to discriminate between the primary sequences of homodimers
	and non-homodimers and the two protein sequence descriptors can reflect
	the quaternary structure information. {C}ompared with previous {R}obert
	{G}arian's investigation, the performance of {SVM} is almost equal
	to that of the {D}ecision tree models, and the methods of extracting
	feature vector from the primary sequences are superior to {R}obert's
	binning function method. {A}vailability: {P}rograms are available
	on request from the authors.},
  pdf = {../local/Zhang2003Classification.pdf},
  file = {Zhang2003Classification.pdf:local/Zhang2003Classification.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/18/2390}
}
@article{Zhang2003Sequence,
  author = {Zhang, X. H-F. and Heller, K. A. and Hefter, I. and Leslie, C. S.
	and Chasin, L. A.},
  title = {Sequence Information for the Splicing of Human Pre-{mRNA}
	Identified by Support Vector Machine Classification},
  journal = {Genome {R}es.},
  year = {2003},
  volume = {13},
  pages = {2637--2650},
  number = {12},
  abstract = {Vertebrate pre-m{RNA} transcripts contain many sequences that resemble
	splice sites on the basis of agreement to the consensus, yet these
	more numerous false splice sites are usually completely ignored by
	the cellular splicing machinery. {E}ven at the level of exon definition,
	pseudo exons defined by such false splices sites outnumber real exons
	by an order of magnitude. {W}e used a support vector machine to discover
	sequence information that could be used to distinguish real exons
	from pseudo exons. {T}his machine learning tool led to the definition
	of potential branch points, an extended polypyrimidine tract, and
	{C}-rich and {TG}-rich motifs in a region limited to 50 nt upstream
	of constitutively spliced exons. {C}-rich sequences were also found
	in a region extending to 80 nt downstream of exons, along with {G}-triplet
	motifs. {I}n addition, it was shown that combinations of three bases
	within the splice donor consensus sequence were more effective than
	consensus values in distinguishing real from pseudo splice sites;
	two-way base combinations were optimal for distinguishing 3' splice
	sites. {T}hese data also suggest that interactions between two or
	more of these elements may contribute to exon recognition, and provide
	candidate sequences for assessment as intronic splicing enhancers.},
  doi = {10.1101/gr.1679003},
  pdf = {../local/Zhang2003Sequence.pdf},
  file = {Zhang2003Sequence.pdf:local/Zhang2003Sequence.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://www.genome.org/cgi/content/abstract/13/12/2637}
}
@article{Zhang2005Descriptor-based,
  author = {Zhang, Z. and Kochhar, S. and Grigorov, M. G.},
  title = {Descriptor-based protein remote homology identification},
  journal = {Protein {S}ci.},
  year = {2005},
  volume = {14},
  pages = {431--444},
  number = {2},
  abstract = {Here, we report a novel protein sequence descriptor-based remote homology
	identification method, able to infer fold relationships without the
	explicit knowledge of structure. {I}n a first phase, we have individually
	benchmarked 13 different descriptor types in fold identification
	experiments in a highly diverse set of protein sequences. {T}he relevant
	descriptors were related to the fold class membership by using simple
	similarity measures in the descriptor spaces, such as the cosine
	angle. {O}ur results revealed that the three best-performing sets
	of descriptors were the sequence-alignment-based descriptor using
	{PSI}-{BLAST} e-values, the descriptors based on the alignment of
	secondary structural elements ({SSEA}), and the descriptors based
	on the occurrence of {PROSITE} functional motifs. {I}n a second phase,
	the three top-performing descriptors were combined to obtain a final
	method with improved performance, which we named {D}esc{F}old. {C}lass
	membership was predicted by {S}upport {V}ector {M}achine ({SVM})
	learning. {I}n comparison with the individual {PSI}-{BLAST}-based
	descriptor, the rate of remote homology identification increased
	from 33.7\% to 46.3\%. {W}e found out that the composite set of descriptors
	was able to identify the true remote homolog for nearly every sixth
	sequence at the 95\% confidence level, or some 10\% more than a single
	{PSI}-{BLAST} search. {W}e have benchmarked the {D}esc{F}old method
	against several other state-of-the-art fold recognition algorithms
	for the 172 {L}ive{B}ench-8 targets, and we concluded that it was
	able to add value to the existing techniques by providing a confident
	hit for at least 10\% of the sequences not identifiable by the previously
	known methods.},
  doi = {10.1110/ps.041035505},
  pdf = {../local/Zhang2005Descriptor-based.pdf},
  file = {Zhang2005Descriptor-based.pdf:local/Zhang2005Descriptor-based.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://dx.doi.org/10.1110/ps.041035505}
}
@article{Zhao2003Application,
  author = {Zhao, Y. and Pinilla, C. and Valmori, D. and Martin, R. and Simon,
	R.},
  title = {Application of support vector machines for {T}-cell epitopes prediction},
  journal = {Bioinformatics},
  year = {2003},
  volume = {19},
  pages = {1978--1984},
  number = {15},
  abstract = {Motivation: {T}he {T}-cell receptor, a major histocompatibility complex
	({MHC}) molecule, and a bound antigenic peptide, play major roles
	in the process of antigen-specific {T}-cell activation. {T}-cell
	recognition was long considered exquisitely specific. {R}ecent data
	also indicate that it is highly flexible, and one receptor may recognize
	thousands of different peptides. {D}eciphering the patterns of peptides
	that elicit a {MHC} restricted {T}-cell response is critical for
	vaccine development. {R}esults: {F}or the first time we develop a
	support vector machine ({SVM}) for {T}-cell epitope prediction with
	an {MHC} type {I} restricted {T}-cell clone. {U}sing cross-validation,
	we demonstrate that {SVM}s can be trained on relatively small data
	sets to provide prediction more accurate than those based on previously
	published methods or on {MHC} binding. {S}upplementary information:
	{D}ata for 203 synthesized peptides is available at http://linus.nci.nih.gov/{D}ata/{LAU}203\_{P}eptide.pdf},
  pdf = {../local/Zhao2003Application.pdf},
  file = {Zhao2003Application.pdf:local/Zhao2003Application.pdf:PDF},
  keywords = {biosvm immunoinformatics},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/19/15/1978}
}
@article{Zhou2005Recognition,
  author = {GuoDong Zhou and Dan Shen and Jie Zhang and Jian Su and SoonHeng
	Tan},
  title = {Recognition of protein/gene names from text using an ensemble of
	classifiers},
  journal = {BMC Bioinformatics},
  year = {2005},
  volume = {6},
  pages = {S7},
  number = {Suppl 1},
  abstract = {This paper proposes an ensemble of classifiers for biomedical name
	recognition in which three classifiers, one {S}upport {V}ector {M}achine
	and two discriminative {H}idden {M}arkov {M}odels, are combined effectively
	using a simple majority voting strategy. {I}n addition, we incorporate
	three post-processing modules, including an abbreviation resolution
	module, a protein/gene name refinement module and a simple dictionary
	matching module, into the system to further improve the performance.
	{E}valuation shows that our system achieves the best performance
	from among 10 systems with a balanced {F}-measure of 82.58 on the
	closed evaluation of the {B}io{C}reative protein/gene name recognition
	task ({T}ask 1{A}).},
  doi = {10.1186/1471-2105-6-S1-S7},
  pdf = {../local/Zhou2005Recognition.pdf},
  file = {Zhou2005Recognition.pdf:local/Zhou2005Recognition.pdf:PDF},
  keywords = {biosvm nlp},
  pii = {1471-2105-6-S1-S7},
  url = {http://dx.doi.org/10.1186/1471-2105-6-S1-S7}
}
@article{Zhou2005LS,
  author = {Xin Zhou and K. Z. Mao},
  title = {{LS} {Bound} based gene selection for {DNA} microarray data},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21},
  pages = {1559--1564},
  number = {8},
  month = apr,
  abstract = {M{OTIVATION}: {O}ne problem with discriminant analysis of {DNA} microarray
	data is that each sample is represented by quite a large number of
	genes, and many of them are irrelevant, insignificant or redundant
	to the discriminant problem at hand. {M}ethods for selecting important
	genes are, therefore, of much significance in microarray data analysis.
	{I}n the present study, a new criterion, called {LS} {B}ound measure,
	is proposed to address the gene selection problem. {T}he {LS} {B}ound
	measure is derived from leave-one-out procedure of {LS}-{SVM}s (least
	squares support vector machines), and as the upper bound for leave-one-out
	classification results it reflects to some extent the generalization
	performance of gene subsets. {RESULTS}: {W}e applied this {LS} {B}ound
	measure for gene selection on two benchmark microarray datasets:
	colon cancer and leukemia. {W}e also compared the {LS} {B}ound measure
	with other evaluation criteria, including the well-known {F}isher's
	ratio and {M}ahalanobis class separability measure, and other published
	gene selection algorithms, including {W}eighting factor and {SVM}
	{R}ecursive {F}eature {E}limination. {T}he strength of the {LS} {B}ound
	measure is that it provides gene subsets leading to more accurate
	classification results than the filter method while its computational
	complexity is at the level of the filter method. {AVAILABILITY}:
	{A} companion website can be accessed at http://www.ntu.edu.sg/home5/pg02776030/lsbound/.
	{T}he website contains: (1) the source code of the gene selection
	algorithm; (2) the complete set of tables and figures regarding the
	experimental study; (3) proof of the inequality (9). {CONTACT}: ekzmao@ntu.edu.sg.},
  doi = {10.1093/bioinformatics/bti216},
  pdf = {../local/Zhou2005LS.pdf},
  file = {Zhou2005LS.pdf:local/Zhou2005LS.pdf:PDF},
  keywords = {biosvm featureselection microarray},
  pii = {bti216},
  url = {http://dx.doi.org/10.1093/bioinformatics/bti216}
}
@article{Zhu2003Introduction,
  author = {Lingyun Zhu and Baoming Wu and Changxiu Cao},
  title = {Introduction to medical data mining},
  journal = {Sheng Wu Yi Xue Gong Cheng Xue Za Zhi},
  year = {2003},
  volume = {20},
  pages = {559--562},
  number = {3},
  month = sep,
  abstract = {Modern medicine generates a great deal of information stored in the
	medical database. {E}xtracting useful knowledge and providing scientific
	decision-making for the diagnosis and treatment of disease from the
	database increasingly becomes necessary. {D}ata mining in medicine
	can deal with this problem. {I}t can also improve the management
	level of hospital information and promote the development of telemedicine
	and community medicine. {B}ecause the medical information is characteristic
	of redundancy, multi-attribution, incompletion and closely related
	with time, medical data mining differs from other one. {I}n this
	paper we have discussed the key techniques of medical data mining
	involving pretreatment of medical data, fusion of different pattern
	and resource, fast and robust mining algorithms and reliability of
	mining results. {T}he methods and applications of medical data mining
	based on computation intelligence such as artificial neural network,
	fuzzy system, evolutionary algorithms, rough set, and support vector
	machine have been introduced. {T}he features and problems in data
	mining are summarized in the last section.},
  keywords = {Algorithms, Anion Exchange Resins, Automatic Data Processing, Chemical,
	Chromatography, Computational Biology, Computer-Assisted, Data Interpretation,
	Databases, Decision Making, Decision Trees, English Abstract, Factual,
	Fuzzy Logic, Humans, Indicators and Reagents, Information Storage
	and Retrieval, Ion Exchange, Models, Neural Networks (Computer),
	Non-P.H.S., Non-U.S. Gov't, Nucleic Acid Conformation, P.H.S., Proteins,
	Quantitative Structure-Activity Relationship, RNA, ROC Curve, Research
	Support, Sequence Analysis, Statistical, Transfer, U.S. Gov't, 14565039},
  pmid = {14565039}
}
@article{Zien2000Engineering,
  author = {Zien, A. and R{\"a}tsch, G. and Mika, S. and Sch{\"o}lkopf, B. and
	Lengauer, T. and M{\"u}ller, K.-R.},
  title = {Engineering support vector machine kernels that recognize translation
	initiation sites},
  journal = {Bioinformatics},
  year = {2000},
  volume = {16},
  pages = {799--807},
  number = {9},
  abstract = {Motivation: {I}n order to extract protein sequences from nucleotide
	sequences, it is an important step to recognize points at which regions
	start that code for proteins. {T}hese points are called translation
	initiation sites ({TIS}). {R}esults: {T}he task of finding {TIS}
	can be modeled as a classification problem. {W}e demonstrate the
	applicability of support vector machines for this task, and show
	how to incorporate prior biological knowledge by engineering an appropriate
	kernel function. {W}ith the described techniques the recognition
	performance can be improved by 26\% over leading existing approaches.
	{W}e provide evidence that existing related methods (e.g. {ESTS}can)
	could profit from advanced {TIS} recognition.},
  pdf = {../local/Zien2000Engineering.pdf},
  file = {Zien2000Engineering.pdf:local/Zien2000Engineering.pdf:PDF},
  keywords = {biosvm},
  owner = {jeanphilippevert},
  url = {http://bioinformatics.oupjournals.org/cgi/content/abstract/16/9/799}
}
@comment{{jabref-meta: selector_author:}}
@comment{{jabref-meta: selector_journal:Adv. Drug Deliv. Rev.;Am. J. Hu
m. Genet.;Am. J. Pathol.;Ann. Appl. Stat.;Ann. Math. Statist.;Ann. N. 
Y. Acad. Sci.;Ann. Probab.;Ann. Stat.;Artif. Intell. Med.;Bernoulli;Bi
ochim. Biophys. Acta;Bioinformatics;Biometrika;BMC Bioinformatics;Br. 
J. Pharmacol.;Breast Cancer Res.;Cell;Cell. Signal.;Chem. Res. Toxicol
.;Clin. Cancer Res.;Combinator. Probab. Comput.;Comm. Pure Appl. Math.
;Comput. Chem.;Comput. Comm. Rev.;Comput. Stat. Data An.;Curr. Genom.;
Curr. Opin. Chem. Biol.;Curr. Opin. Drug Discov. Devel.;Data Min. Know
l. Discov.;Electron. J. Statist.;Eur. J. Hum. Genet.;FEBS Lett.;Found.
 Comput. Math.;Genome Biol.;IEEE T. Neural Networ.;IEEE T. Pattern. An
al.;IEEE T. Signal. Proces.;IEEE Trans. Inform. Theory;IEEE Trans. Kno
wl. Data Eng.;IEEE/ACM Trans. Comput. Biol. Bioinf.;Int. J. Comput. Vi
sion;Int. J. Data Min. Bioinform.;Int. J. Quantum Chem.;J Biol Syst;J. A
CM;J. Am. Soc. Inf. Sci. Technol.;J. Am. Stat. Assoc.;J. Bioinform. C
omput. Biol.;J. Biol. Chem.;J. Biomed. Inform.;J. Cell. Biochem.;J. Ch
em. Inf. Comput. Sci.;J. Chem. Inf. Model.;J. Clin. Oncol.;J. Comput. 
Biol.;J. Comput. Graph. Stat.;J. Eur. Math. Soc.;J. Intell. Inform. Sy
st.;J. Mach. Learn. Res.;J. Med. Chem.;J. Mol. Biol.;J. R. Stat. Soc. S
er. B;Journal of Statistical Planning and Inference;Mach. Learn.;Math
. Program.;Meth. Enzymol.;Mol. Biol. Cell;Mol. Biol. Evol.;Mol. Cell. 
Biol.;Mol. Syst. Biol.;N. Engl. J. Med.;Nat. Biotechnol.;Nat. Genet.;N
at. Med.;Nat. Methods;Nat. Rev. Cancer;Nat. Rev. Drug Discov.;Nat. Rev
. Genet.;Nature;Neural Comput.;Neural Network.;Neurocomputing;Nucleic 
Acids Res.;Pattern Anal. Appl.;Pattern Recognit.;Phys. Rev. E;Phys. Re
v. Lett.;PLoS Biology;PLoS Comput. Biol.;Probab. Theory Relat. Fields;
Proc. IEEE;Proc. Natl. Acad. Sci. USA;Protein Eng.;Protein Eng. Des. S
el.;Protein Sci.;Protein. Struct. Funct. Genet.;Random Struct. Algorit
hm.;Rev. Mod. Phys.;Science;Stat. Probab. Lett.;Statistica Sinica;Theo
r. Comput. Sci.;Trans. Am. Math. Soc.;Trends Genet.;}}
@comment{{jabref-meta: selector_keywords:biogm;biosvm;breastcancer;cgh;
chemogenomics;chemoinformatics;csbcbook;csbcbook-ch1;csbcbook-ch2;csbc
book-ch3;csbcbook-ch4;csbcbook-ch5;csbcbook-ch6;csbcbook-ch7;csbcbook-
ch8;csbcbook-ch9;csbcbook-mustread;dimred;featureselection;glycans;her
g;hic;highcontentscreening;image;immunoinformatics;kernel-theory;kerne
lbook;lasso;microarray;ngs;nlp;plasmodium;proteomics;PUlearning;rnaseq
;segmentation;sirna;}}
@comment{{jabref-meta: selector_booktitle:Adv. Neural. Inform. Process 
Syst.;}}

This file was generated by bibtex2html 1.97.