% Journal article record. The long affiliation/funding text is stored in
% `annote` rather than `note`, because standard styles print `note` verbatim
% in the bibliography while `annote` is ignored by them.
@article{f7b39f8e311043979b564404cff1206b,
  author    = {Pratola, Matthew T. and Chipman, Hugh A. and Gattiker, James R. and Higdon, David M. and McCulloch, Robert and Rust, William N.},
  title     = {Parallel {Bayesian} Additive Regression Trees},
  journal   = {Journal of Computational and Graphical Statistics},
  volume    = {23},
  number    = {3},
  pages     = {830--852},
  year      = {2014},
  month     = jul,
  day       = {3},
  publisher = {American Statistical Association},
  issn      = {1061-8600},
  doi       = {10.1080/10618600.2013.841584},
  language  = {English (US)},
  keywords  = {Big Data, Markov chain Monte Carlo, Nonlinear, Scalable, Statistical computing},
  abstract  = {Bayesian additive regression trees (BART) is a Bayesian approach to flexible nonlinear regression which has been shown to be competitive with the best modern predictive methods such as those based on bagging and boosting. BART offers some advantages. For example, the stochastic search Markov chain Monte Carlo (MCMC) algorithm can provide a more complete search of the model space and variation across MCMC draws can capture the level of uncertainty in the usual Bayesian way. The BART prior is robust in that reasonable results are typically obtained with a default prior specification. However, the publicly available implementation of the BART algorithm in the R package BayesTree is not fast enough to be considered interactive with over a thousand observations, and is unlikely to even run with 50,000 to 100,000 observations. In this article we show how the BART algorithm may be modified and then computed using single program, multiple data (SPMD) parallel computation implemented using the Message Passing Interface (MPI) library. The approach scales nearly linearly in the number of processor cores, enabling the practitioner to perform statistical inference on massive datasets. Our approach can also handle datasets too massive to fit on any single data repository.},
  annote    = {Funding Information: Matthew T. Pratola is Assistant Professor, Department of Statistics, The Ohio State University, 1958 Neil Avenue, 404 Cockins Hall, Columbus, OH 43210 (E-mail: mpratola@stat.osu.edu). Hugh A. Chipman is Professor and Canada Research Chair in Mathematical Modeling, Department of Mathematics and Statistics, Acadia University, 12 University Avenue, Huggins Science Hall, Wolfville, NS, Canada B4P 2R6 (E-mail: hugh.chipman@acadiau.ca). James R. Gattiker is Scientist with the Statistical Sciences Group, P.O. Box 1663, MS-F600, Los Alamos National Laboratory, Los Alamos, NM 87545 (E-mail: gatt@lanl.gov). David M. Higdon is Group Leader, Statistical Sciences Group, P.O. Box 1663, MS F-600, Los Alamos National Laboratory, Los Alamos, NM 87545 (E-mail: dhigdon@lanl.gov). Robert McCulloch is the Katherine Dusak Miller Professor of Econometrics and Statistics, Booth School of Business, University of Chicago, 5807 South Woodlawn Avenue, Chicago, IL 60637 (E-mail: robert.mcculloch@chicagobooth.edu). William N. Rust is Scientist with the Statistical Sciences Group, P.O. Box 1663, MS F-600, Los Alamos National Laboratory, Los Alamos, NM 87545 (E-mail: wnr@lanl.gov). This work was supported in part by the U.S. Department of Energy Office of Science, Office of Advanced Scientific Computing Research, Scientific Discovery through Advanced Computing (SciDAC) program. Publisher Copyright: {\textcopyright} 2014 American Statistical Association, Institute of Mathematical Statistics, and Interface Foundation of North America.},
}