% Dubey, Zhou, Wang, Thompson, Ye (2014), NeuroImage 87, pp. 220--241.
% The note field holds the funding acknowledgements; ampersands there are
% escaped so the entry compiles under styles that print the note.
@article{3882c2eb436a45eabaa10613c5410960,
  author    = {Dubey, Rashmi and Zhou, Jiayu and Wang, Yalin and Thompson, Paul M. and Ye, Jieping},
  title     = {Analysis of sampling techniques for imbalanced data: An n=648 {ADNI} study},
  journal   = {NeuroImage},
  volume    = {87},
  pages     = {220--241},
  year      = {2014},
  month     = feb,
  day       = {15},
  doi       = {10.1016/j.neuroimage.2013.10.005},
  issn      = {1053-8119},
  publisher = {Academic Press Inc.},
  language  = {English (US)},
  keywords  = {Alzheimer's disease, Classification, Feature selection, Imbalanced data, Oversampling, Undersampling},
  abstract  = {Many neuroimaging applications deal with imbalanced imaging data. For example, in Alzheimer's Disease Neuroimaging Initiative (ADNI) dataset, the mild cognitive impairment (MCI) cases eligible for the study are nearly two times the Alzheimer's disease (AD) patients for structural magnetic resonance imaging (MRI) modality and six times the control cases for proteomics modality. Constructing an accurate classifier from imbalanced data is a challenging task. Traditional classifiers that aim to maximize the overall prediction accuracy tend to classify all data into the majority class. In this paper, we study an ensemble system of feature selection and data sampling for the class imbalance problem. We systematically analyze various sampling techniques by examining the efficacy of different rates and types of undersampling, oversampling, and a combination of over and undersampling approaches. We thoroughly examine six widely used feature selection algorithms to identify significant biomarkers and thereby reduce the complexity of the data. The efficacy of the ensemble techniques is evaluated using two different classifiers including Random Forest and Support Vector Machines based on classification accuracy, area under the receiver operating characteristic curve (AUC), sensitivity, and specificity measures. Our extensive experimental results show that for various problem settings in ADNI, (1) a balanced training set obtained with K-Medoids technique based undersampling gives the best overall performance among different data sampling techniques and no sampling approach; and (2) sparse logistic regression with stability selection achieves competitive performance among various feature selection algorithms. Comprehensive experiments with various settings show that our proposed ensemble model of multiple undersampled datasets yields stable and promising results.},
  note      = {Funding Information: Data collection and sharing for this project was funded by the Alzheimer's Disease Neuroimaging Initiative (ADNI) (National Institutes of Health grant U01 AG024904). ADNI is funded by the National Institute on Aging, the National Institute of Biomedical Imaging and Bioengineering, and through the generous contributions from the following: Abbott; Alzheimer's Association; Alzheimer's Drug Discovery Foundation; Amorfix Life Sciences Ltd.; AstraZeneca; Bayer HealthCare; BioClinica, Inc.; Biogen Idec Inc.; Bristol-Myers Squibb Company; Eisai Inc.; Elan Pharmaceuticals Inc.; Eli Lilly and Company; F. Hoffmann-La Roche Ltd and its affiliated company Genentech, Inc.; GE Healthcare; Innogenetics, N.V.; Janssen Alzheimer Immunotherapy Research \& Development, LLC.; Johnson \& Johnson Pharmaceutical Research \& Development LLC.; Medpace, Inc.; Merck \& Co., Inc.; Meso Scale Diagnostics, LLC.; Novartis Pharmaceuticals Corporation; Pfizer Inc.; Servier; Synarc Inc.; and Takeda Pharmaceutical Company. The Canadian Institutes of Health Research is providing funds to support ADNI clinical sites in Canada. Private sector contributions are facilitated by the Foundation for the National Institutes of Health (www.fnih.org). The grantee organization is the Northern California Institute for Research and Education, and the study is coordinated by the Alzheimer's disease Cooperative Study at the University of California, San Diego. ADNI data are disseminated by the Laboratory for Neuro Imaging at the University of California, Los Angeles. This research was also supported by NIH grants P30 AG010129, K01 AG030514, and the Dana Foundation.
Funding Information: This work was funded by the National Institute on Aging (AG016570 to PMT and R21AG043760 to YW), the National Library of Medicine, the National Institute for Biomedical Imaging and Bioengineering, and the National Center for Research Resources (LM05639, EB01651, RR019771 to PMT), the US National Science Foundation (NSF) (IIS-0812551, IIS-0953662 to JY), and the National Library of Medicine (R01 LM010730 to JY).},
}