% Annotated bibliography: clustering and analysis of gene-expression
% (microarray) data, chemometrics, data mining, and statistical pattern
% recognition.
%
% Conventions used in this file
%   - Every entry carries an "annote" field holding the reviewer's summary;
%     standard styles ignore it, annotated styles print it.
%   - URLs are wrapped in \url{...}, so the citing document must load the
%     url (or hyperref) package.
%   - Journal names that recur are defined once as string macros below.
%
% NOTE(review): several numeric fields had evidently lost their zero digits
% (e.g. a year of 2 or 21, a page range of 1166--1161).  Years were restored
% to 2000/2001 and the page ranges of shar2, sher2, gaas2, getz2 and
% gee.maron.ames:detection were restored to the published values as best
% known -- please confirm against the publisher records.

@string{cabios = {Computer Applications in the Biosciences}}
@string{PNAS   = {Proceedings of the National Academy of Sciences}}

% ---------------------------------------------------------------------------
% Gene-expression analysis and clustering
% ---------------------------------------------------------------------------

@incollection{dhae98,
  author    = {D'haeseleer, Patrik and Wen, Xiling and Fuhrman, Stefanie and Somogyi, Roland},
  title     = {Mining the gene expression matrix: Inferring gene relationships from large scale gene expression data},
  booktitle = {Information Processing in Cells and Tissues},
  publisher = {Plenum},
  year      = {1998},
  annote    = {This paper seems to focus specifically on a particular gene
               expression matrix from previously published work. The matrix
               provides information on 112 genes at nine stages during
               development of the rat cervical spinal cord.\\
               Establishes relationships among genes by linear correlation
               and rank correlation. \\
               Available on the web at: \\
               \url{http://www.cs.unm.edu/~patrik/networks/IPCAT/ipcat.html}},
}

@mastersthesis{duti99,
  author = {Dutilh, Bas},
  title  = {Analysis of data from microarray experiments: The state of the art in gene network reconstruction},
  school = {Utrecht University},
  year   = {1999},
  annote = {A summary of the literature; the title tells all. This is two
            years old so probably somewhat outdated. Provides background on
            details of the theory behind microarrays. Also gives the theory
            behind quantification of expression levels. \\
            The focus is on reconstruction of gene networks. Includes
            discussion of a graph theoretic clustering algorithm. Discusses
            general properties of gene networks and how they are modeled.
            Includes a section on promoter analysis. Available on the web
            at: \\
            \url{http://www-binf.bio.uu.nl/~dutilh/gene-networks/thesis.html}},
}

@article{shar2,
  author  = {Sharan, R. and Shamir, R.},
  title   = {{CLICK}: a clustering algorithm with applications to gene expression analysis},
  journal = {Proc Int Conf Intell Syst Mol Biol},
  year    = {2000},
  volume  = {8},
  pages   = {307--316},
  annote  = {I do not have a copy of this but want to read it later. Ron
             Shamir is very big in applications of graph theory to molecular
             biology.},
}

@article{sher2,
  author  = {Sherlock, Gavin},
  title   = {Analysis of large-scale gene expression data},
  journal = {Current Opinion in Immunology},
  year    = {2000},
  volume  = {12},
  pages   = {201--205},
  annote  = {A review article. Discusses several methods of cluster analysis
             including hierarchical clustering, divisive clustering,
             self-organizing maps and k-means clustering.\\
             Also discusses correlation of expression data with other
             information. Contains a useful annotated bibliography.},
}

@article{gaas2,
  author  = {Gaasterland, Terry and Bekiranov, Stefan},
  title   = {Making the most of microarray data},
  journal = {Nature Genetics},
  year    = {2000},
  volume  = {24},
  pages   = {204--206},
  annote  = {A brief overview of supervised learning using support vector
             machines. References a recent research article on support
             vector machines. Also discusses a software package that uses a
             support vector machine to classify microarray data. Almost a
             news article.},
}

@article{toro99,
  author  = {T{\"o}r{\"o}nen, Petri and Kolehmainen, Mikko and Wong, Garry and Castr{\'e}n, Eero},
  title   = {Analysis of gene expression data using self-organizing maps},
  journal = {FEBS Letters},
  year    = {1999},
  volume  = {451},
  pages   = {142--146},
  annote  = {``A self-organizing map (SOM) is an unsupervised neural network
             learning algorithm...''. The paper uses SOM to classify
             published data of yeast gene expression. The authors are
             looking at all yeast genes at once. They observe the changes in
             gene expression induced by several familiar metabolic
             conditions; including diauxic shift, sporulation and
             cell-cycle.\\
             SOM has been used in several different fields to analyze and
             organize very large data sets. The authors investigate the
             usefulness of SOM with gene expression data. They analyze
             publicly available data and conclude that SOM reliably clusters
             similar genes.\\
             SOM works best with very large data sets.},
}

@article{eise98,
  author  = {Eisen, Michael B. and Spellman, Paul T. and Brown, Patrick O. and Botstein, David},
  title   = {Cluster analysis and display of genome-wide expression patterns},
  journal = PNAS,
  year    = {1998},
  volume  = {95},
  pages   = {14863--14868},
  annote  = {Discusses cluster analysis algorithm that classifies genes
             based on similarity in pattern of gene expression. This does
             not mean that genes classified as being close really have
             similar function. The idea is that genes that are activated in
             response to the same stimulus are classified together. This
             sounds stupid at first but it really makes sense as a first
             approximation. An analogy would be to think of a Martian who
             has been assigned the task of classifying three unknown human
             artifacts; a tire, a steering wheel, and a nickel. At first the
             Martian might classify all three together because they are all
             disk shaped. On closer study she might classify the steering
             wheel and tire together based on size. Later she might even
             realize that a tire and steering wheel are both car parts and
             classify them together that way. Classifying genes probably
             works in a similar way. We know that genes A and B are both
             activated in response to heat stress; later we will discover
             that A codes for histone protein and B codes for telomerase. \\
             The clustering algorithm follows that of Sokal and Michener;
             published in 1958. The idea is to take a large population of
             genes. Find the two genes with most similar expression profiles
             and average their profile. This average then replaces the two
             similar genes and is weighted more heavily accordingly. The
             process is repeated until only a single node remains. The
             result is a dendrogram that classifies all genes by expression
             pattern.\\
             The paper does not provide an overview of clustering methods.
             The paper presents a method of visualizing the resulting
             classification data using color. Large quantities of
             information can be taken in at a glance when the data is in the
             form of color. If a viewer was to glance at a table of numbers
             containing exactly the same information the information content
             would be quite low. \\
             Eisen provides tons of software at his website. \\
             \url{http://mcb.berkeley.edu/faculty/GEN/eisenm.html}},
}

@article{getz2,
  author  = {Getz, Gad and Levine, Erel and Domany, Eytan},
  title   = {Coupled two-way clustering analysis of gene microarray data},
  journal = PNAS,
  year    = {2000},
  volume  = {97},
  pages   = {12079--12084},
  annote  = {Proposes the idea of clustering gene expression data in two
             ways. First compounds are clustered according to their effect
             on genes and then genes are clustered into groups that have
             similar expression profiles. Previous authors have performed
             this sort of two-way analysis. The innovation of this paper is
             the coupling. The result is to focus attention on a small
             subset of the available data. Any clustering algorithm can be
             used with coupled two-way clustering (CTWC). Introduces a
             clustering algorithm called \emph{superparamagnetic} clustering
             which the authors claim is ``especially suitable'' for
             microarray data analysis. Applies their technique to two cell
             types; colon cancer and leukemia.},
}

@inproceedings{dhae99,
  author    = {D'haeseleer, Patrik and Wen, Xiling and Fuhrman, Stefanie and Somogyi, Roland},
  title     = {Linear Modeling of {mRNA} expression levels during {CNS} development and injury},
  booktitle = {Pacific Symposium on Biocomputing '99},
  year      = {1999},
  pages     = {41--52},
  annote    = {Like the other work by the same authors this is specialized to
               rat CNS. Provides the details of the linear model. \\
               Available on the web at: \\
               \url{http://www.cs.unm.edu/~patrik/networks/PSB99/linmod.html}},
}

% ---------------------------------------------------------------------------
% Chemometrics and chemical pattern recognition
% ---------------------------------------------------------------------------

@book{beebe.pell.seasholtz:chemometrics,
  author    = {Beebe, K. and Pell, R. and Seasholtz, M.},
  title     = {Chemometrics: A Practical Guide},
  publisher = {John Wiley and Sons, Inc.},
  address   = {New York, New York},
  year      = {1998},
  annote    = {This book contains information on the use of mathematics with
               chemical analysis. It includes chapters on defining the
               problem, preprocessing data, pattern recognition, and
               prediction. Within the pattern recognition chapters
               information is given on cluster analysis,
               supervised/unsupervised learning, and nearest neighbor
               analysis.},
}

@book{otto:chemometrics,
  author    = {Otto, Matthias},
  title     = {Chemometrics: Statistics and Computer Application in Analytical Chemistry},
  publisher = {WILEY-VCH},
  address   = {Hemsbach, Germany},
  year      = {1999},
  annote    = {This book is very current with material on basic statistics
               and pattern recognition. It's dedicated to the evaluation of
               experimental observations and not to theoretical aspects. A
               very in-depth and complete analysis is done on clustering
               methods, graphical methods, discriminant analysis,
               supervised/unsupervised methods, and factorial methods.
               Mathematics is shown in detail as well as great figures,
               equations and diagrams of select data. Chapters are also
               included on modeling and quality assurance and practice.},
}

@book{Strouf:chemical,
  author    = {Strouf, O.},
  title     = {Chemical Pattern Recognition},
  publisher = {John Wiley and Sons, Inc.},
  address   = {Letchworth, England},
  year      = {1986},
  annote    = {Strouf offers a very in-depth analysis of chemical patterns
               from methods to classification techniques. It also includes a
               chapter on visualization of the data including plots,
               mappings, and transformations. Finally a chapter includes
               recent trends in chemical pattern recognition, although the
               book was written in 1986.},
}

@misc{Vilo:Expression,
  author       = {Vilo, Jaak},
  title        = {{Expression Profiler}},
  howpublished = {\url{http://ep.ebi.ac.uk}},
  year         = {2001},
  note         = {Set of tools for gene expression data, World Wide Web},
  annote       = {This is a web page which offers tools for the clustering,
                  analysis and visualization of gene expression and sequence
                  data. It also provides many links to web sites with
                  additional information on this subject.},
}

% ---------------------------------------------------------------------------
% Genetic algorithms and mathematical modeling
% ---------------------------------------------------------------------------

@book{man.tang.ea:genetic,
  author    = {Man, K. F. and Tang, K. S. and Kwong, S.},
  title     = {Genetic Algorithms Concepts and Designs},
  publisher = {Springer-Verlag London Limited},
  address   = {Great Britain},
  year      = {1999},
  annote    = {This is an advanced textbook which presents information about
               several genetic algorithms. There are 29 figures and
               mathematical models included for visual clarification. Topics
               covered include class distribution data (for example,
               cancer), neural networks, gene manipulation (turning on or
               off) and problems in the area of control and signal
               processing of genes. There is also a computer disk at the
               back of the book that contains an interactive genetic
               algorithm demonstration.},
}

@book{mostow:genetic,
  editor    = {Mostow, G. D.},
  title     = {Mathematical Models for Cell Rearrangement},
  publisher = {The Murray Printing Co.},
  address   = {Forge Village, Mass.},
  year      = {1975},
  annote    = {This book deals with the use of mathematical modeling to
               examine cell movement in biological tissue. Also covered are
               ways to model the processes of sorting out, invasion, and
               cell aggregation. Pattern recognition and reversal of cells
               is also explained, using the mechanism of Kinetics.},
}

@mastersthesis{Cone:applications,
  author  = {Cone, Gina Leyba},
  title   = {Applications of Genetic Algorithms in Vector Quantization},
  school  = {Colorado State University},
  address = {Fort Collins, Co.},
  year    = {1995},
  annote  = {This thesis was written by the author to satisfy a requirement
             for the Master of Science degree in Electrical Engineering. The
             author goes into great detail to define and demonstrate
             computer programs using various genetic algorithms. Such things
             as ``cross-over functions'' are mentioned in the text. There is
             a detailed strategy for using Mathematics, Computer Science,
             and Engineering techniques to determine mutation in genes.},
}

% ---------------------------------------------------------------------------
% Statistics, data mining, and multivariate analysis
% ---------------------------------------------------------------------------

@phdthesis{dearaujo:statistically,
  author  = {{de Araujo}, Jose Vicente Granato},
  title   = {A Statistically Based Procedure for Calibration of Water Distribution Systems},
  school  = {Oklahoma State University},
  address = {Stillwater, Oklahoma},
  year    = {1992},
  month   = may,
  annote  = {This Ph.D. thesis discusses a statistically based calibration
             method for water distribution systems. The author gives an
             in-depth analysis of the calibration procedure discussing
             analytical methods, optimization methods, and uncertainty
             analysis for estimating demands and C-factors. A linear
             regression technique for estimating the C-factors is discussed.
             Also, a procedure for transferring uncertainties in input data
             to the parameter estimation is explained.},
}

@unpublished{reese.lochmuller:factor,
  author = {Reese, C. and Lochmuller, C.},
  title  = {Introduction to Factor Analysis},
  note   = {web page www.chem.duke.edu/reese/tutor1/factucmp.html},
  annote = {A tutorial designed to provide the basic understanding of the
            principles of factor analysis with a focus on the analysis of a
            \textit{factor space} or \textit{data space}. A geometric
            approach is used to aid in the visualization of the linear
            algebra involved in the problem solutions. Spectra spaces,
            vector plots, factor analysis, error and rank analysis and
            target testing are discussed.},
}

@book{thuraisingham:data,
  author    = {Thuraisingham, B.},
  title     = {Data Mining: Technologies, Techniques, Tools, and Trends},
  publisher = {CRC Press},
  address   = {Boca Raton, Florida},
  year      = {1999},
  annote    = {This book is an overview of data mining's uses, methods,
               technologies, and commercial products. The book defines data
               mining as ``the process of posing various queries and
               extracting useful information, patterns, and trends often
               previously unknown from large quantities of data possibly
               stored in databases.'' The book works on the assumption that
               one needs some technical background to understand the
               algorithms employed in data mining. Part I provides the
               background material for data mining and includes discussions
               of database systems, data warehousing, statistical reasoning,
               machine learning, and parallel processing. Part II describes
               the techniques and tools for data mining. These are the
               algorithms employed for doing the mining. These include
               neural networks, decision trees, and genetic algorithms and
               inductive logic programming. Part III discusses trends and
               directions in data mining. It includes discussion of the
               mining of legacy databases, multiple media and the web.},
}

@book{everitt:cluster,
  author    = {Everitt, B.},
  title     = {Cluster Analysis},
  publisher = {Halsted Press},
  address   = {New York, NY},
  year      = {1993},
  annote    = {A highly readable introduction to cluster analysis. Begins
               the study of clustered data with graphical displays of data.
               Scatterplots, bubble plots, Andrew's plots, stars plots and
               principal component plots are presented as suitable ways of
               presenting multivariate data graphically. Measurement of
               similarity, dissimilarity and distance, hierarchical
               clustering techniques and optimization methods for cluster
               analysis are discussed. The book concludes by presenting
               mixture models, density search clustering, clumping,
               constrained classification and simultaneous clustering.},
}

@book{wickens:geometry,
  author    = {Wickens, T.},
  title     = {The Geometry of Multivariate Statistics},
  publisher = {Lawrence Erlbaum Associates},
  address   = {Hillsdale, New Jersey},
  year      = {1995},
  annote    = {This book is designed to help the reader think about
               multivariate statistics. To apply the traditional algebraic
               and computational approaches to multivariate statistical
               theory, one needs a way to conceptualize the multivariate
               relationships among the variables. As Wickens says,
               ``Problems that involve many variables require a deeper
               understanding than is typically provided by the formal
               equations or the computer programs.'' So, this book presents
               most of the important concepts of multivariate statistics
               geometrically. The book starts with a discussion of variable
               space and subject-space. Vector geometry is reviewed with a
               geometric presentation of the major concepts of linear
               algebra. Bivariate linear regression and multiple regression
               are introduced and some of the common complications such as
               near multicollinearity and orthogonality are discussed.
               Hypothesis testing and removing the effects of a variable or
               a set of variables is presented as a projection into the
               orthogonal complement of the space generated by the removed
               variable. The analysis of variance is presented as the
               framework for the analysis of covariance and the multivariate
               analysis of variance. The book concludes with chapters on
               principal component analysis, factor analysis and canonical
               correlation.},
}

@article{gee.maron.ames:detection,
  author  = {Gee, P. and Maron, D. and Ames, B.},
  title   = {Detection and classification of mutagens: A set of base-specific {Salmonella} tester strains},
  journal = PNAS,
  year    = {1994},
  volume  = {91},
  pages   = {11606--11610},
  annote  = {Point mutations in human oncogenes or tumor suppressor genes
             may lead to cancer, so the detection of mutagens and
             determination of the types of mutation induced are of
             importance to the understanding of the disease. This article
             describes six strains of Salmonella that have been developed to
             identify the six possible base pair substitution mutations.
             Each of the six strains restores, by only one specific
             base-pair substitution, a mutant \textit{his} gene to the wild
             type so that the cell can grow and form a colony without
             histidine. The number of colonies formed is a direct measure of
             the mutagenetic potential of the test compound.},
}

% ---------------------------------------------------------------------------
% Statistical learning, classifier design, PCA/ICA
% ---------------------------------------------------------------------------

@techreport{dmiller.arao.ea:global,
  author      = {Miller, David and Rao, Ajit and Rose, Kenneth and Gersho, Allen},
  title       = {A Global Optimization Technique for Statistical Classifier Design},
  institution = {University of California, Santa Barbara},
  year        = {2001},
  annote      = {This paper discusses classifier designs that attempt to
                 minimize the probability of misclassification. It points
                 out the limitations inherent in conventional techniques
                 such as nearest neighbor and attempts to address those
                 limitations with alternate designs. Several of the
                 techniques utilize linear and non-linear programming. There
                 are also some nice graphics illustrating the concepts.},
}

@phdthesis{tgraepel:statistical,
  author  = {Graepel, Thore},
  title   = {Statistical Physics of Clustering Algorithms},
  school  = {Technische Universit{\"a}t Berlin},
  address = {Berlin, Germany},
  year    = {1998},
  month   = apr,
  annote  = {This Ph.D. thesis discusses some background information on
             supervised learning, unsupervised learning and reinforcement
             learning. Clustering techniques are discussed including various
             methods. Several specific implementations are derived and
             explored.},
}

@misc{tminka:terms,
  author       = {Minka, Thomas},
  title        = {A Statistical Learning/Pattern Recognition Glossary},
  howpublished = {\url{http://www-white.media.mit.edu/~tpminka/statlearn/glossary/glossary.html}},
  year         = {2001},
  note         = {Material on the World Wide Web, MIT},
  annote       = {This web site defines many terms used in classifier design
                  and provides links to resources regarding those terms.},
}

@misc{mmeila.amoore:learn,
  author       = {Meila, Marina and Moore, Andrew W.},
  title        = {Statistical Learning in High Dimensions},
  howpublished = {\url{http://www.cs.cmu.edu/~mmp/workshop-nips99/workshop-nips99.html#description}},
  year         = {1999},
  note         = {Material on the World Wide Web, Carnegie Mellon},
  annote       = {This web site discusses the problems associated with high
                  dimensional data and several approaches to ameliorate the
                  issues.},
}

@techreport{rbar-or.tweinstein.cvanwoudenberg:pca,
  author      = {Bar-Or, Raphael and Weinstein, Tessa and van Woudenberg, Christiaan},
  title       = {Image Decomposition using Principle Component Analysis},
  institution = {University of Colorado, Denver},
  year        = {2000},
  annote      = {This paper discusses the implementation of PCA to the
                 decomposition of two dimensional images. It touches on the
                 notion of training and develops the theory on how PCA is
                 implemented. Source code in Mathematica is provided.},
}

@article{talam.kalam:chemometric,
  author  = {Alam, Todd M. and Alam, Kathleen},
  title   = {Chemometric Analysis of Nuclear Magnetic Resonance Spectroscopy Data},
  journal = {Spectroscopy},
  year    = {2001},
  volume  = {16},
  pages   = {18--27},
  annote  = {This article describes the application of chemometric
             techniques to the classification and quantification of
             compounds based on their NMR spectra. It is a survey of many
             techniques and their applicability to NMR data. The authors
             point out the benefits, drawbacks and assumptions of PCA, ICA
             and three-way techniques.},
}

@misc{tlee:ica,
  author       = {Lee, Te-Won},
  title        = {Introduction to Independent Component Analysis},
  howpublished = {\url{http://www.cnl.salk.edu/~tewon/ICA/intro.html}},
  year         = {2001},
  note         = {Material on the World Wide Web, SALK Institute},
  annote       = {This web site discusses the differences between ICA and
                  PCA and their respective features. It points out several
                  application areas and gives a nice survey of people
                  working in the field of ICA.},
}

@misc{rduda:pattern,
  author       = {Duda, Richard O.},
  title        = {Pattern Recognition for {HCI}},
  howpublished = {\url{http://www.engr.sjsu.edu/~knapp/HCIRODDPR/PR_home.htm}},
  year         = {1997},
  note         = {Material on the World Wide Web, San Jose State University},
  annote       = {This web site is a presentation of a simple classifier,
                  and the steps associated with implementation. It defines
                  several terms including how they are calculated. The terms
                  include Mahalanobis and covariance.},
}

@book{saxler:linear,
  author    = {Axler, Sheldon},
  title     = {Linear Algebra Done Right},
  publisher = {Springer},
  year      = {1997},
  annote    = {This book is a very readable general graduate level linear
               algebra reference.},
}