% scholars17568: Scopus export, cleaned up (DOI restored from the URL's encoded
% form, mojibake in the abstract repaired, identifiers normalised).
% NOTE(review): this is a conference paper published in LNCS vol. 13381; it could
% become @inproceedings (booktitle + series) once the proceedings title is confirmed.
@article{scholars17568,
  author    = {Balogun, A. O. and Odejide, B. J. and Bajeh, A. O. and Alanamu, Z. O. and Usman-Hamza, F. E. and Adeleke, H. O. and Mabayoje, M. A. and Yusuff, S. R.},
  title     = {Empirical Analysis of Data Sampling-Based Ensemble Methods in Software Defect Prediction},
  journal   = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)},
  volume    = {13381},
  pages     = {363--379},
  year      = {2022},
  publisher = {Springer Science and Business Media Deutschland GmbH},
  issn      = {0302-9743},
  isbn      = {978-3-031-10547-0},
  doi       = {10.1007/978-3-031-10548-7_27},
  url       = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85135913774&doi=10.1007\%2f978-3-031-10548-7\%5f27&partnerID=40&md5=482de431af6c972ed54b22c36335dfb6},
  note      = {cited By 2; Conference of 22nd International Conference on Computational Science and Its Applications, ICCSA 2022; Conference Date: 4 July 2022 Through 7 July 2022; Conference Code:281299},
  keywords  = {Barium compounds; Decision trees; Defects; Forecasting, Boosting ensembles; Class imbalance; Data sampling; Ensemble methods; Ensemble techniques; Near-misses; Predictive performance; Sampling method; Sampling technique; Software defect prediction, NASA},
  abstract  = {This research work investigates the deployment of data sampling and ensemble techniques in alleviating the class imbalance problem in software defect prediction (SDP). Specifically, the effect of data sampling techniques on the performance of ensemble methods is investigated. The experiments were conducted using software defect datasets from the NASA software archives. Five data sampling methods (over-sampling techniques (SMOTE, ADASYN, and ROS), and undersampling techniques (RUS and NearMiss) were combined with bagging and boosting ensemble methods based on Na{\"\i}ve Bayes (NB) and Decision Tree (DT) classifier. Predictive performances of developed models were assessed based on the area under the curve (AUC), and Matthew's correlation coefficient (MCC) values.
From the experimental findings, it was observed that the implementation of data sampling methods further enhanced the predictive performances of the experimented ensemble methods. Specifically, BoostedDT on the ROS-balanced datasets recorded the highest average AUC (0.995), and MCC (0.918) values respectively. Aside NearMiss method, which worked best with the Bagging ensemble method, other studied data sampling methods worked well with the Boosting ensemble technique. Also, some of the developed models particularly BoostedDT showed better prediction performance over existing SDP models. As a result, combining data sampling techniques with ensemble methods may not only improve SDP model prediction performance but also provide a plausible solution to the latent class imbalance issue in SDP processes. {\copyright} 2022, The Author(s), under exclusive license to Springer Nature Switzerland AG.},
}