% Scopus-derived record: paper from the 11th Computer Science On-line Conference (CSOC 2022),
% published in Lecture Notes in Networks and Systems, vol. 501.
% The DOI is stored bare (no resolver prefix, no TeX markup); it matches the doi= parameter of the Scopus URL.
@article{scholars17581,
  author    = {Odejide, B. J. and Bajeh, A. O. and Balogun, A. O. and Alanamu, Z. O. and Adewole, K. S. and Akintola, A. G. and Salihu, S. A. and Usman-Hamza, F. E. and Mojeed, H. A.},
  title     = {An Empirical Study on Data Sampling Methods in Addressing Class Imbalance Problem in Software Defect Prediction},
  journal   = {Lecture Notes in Networks and Systems},
  volume    = {501},
  pages     = {594--610},
  year      = {2022},
  publisher = {Springer Science and Business Media Deutschland GmbH},
  isbn      = {978-3-031-09069-1},
  issn      = {2367-3370},
  doi       = {10.1007/978-3-031-09070-7_49},
  url       = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85135080125&doi=10.1007\%2f978-3-031-09070-7\%5f49&partnerID=40&md5=a9a71b38bad17ecba949f800c4db5a84},
  note      = {cited By 1; Conference of 11th Computer Science On-line Conference, CSOC 2022 ; Conference Date: 26 April 2022 Through 26 April 2022; Conference Code:280319},
  abstract  = {With the growing rate of software systems and their applications in diverse walks of life, developing a software system that has no defects is a subject that cannot be overemphasized. Detection of software defects is one of the most prominent difficulties in the area of software engineering (SE) or software development process. Defects are usually unconscious flaws that make the software system behave unexpectedly or contrary to the specified requirements. This has made the subject of software defect prediction (SDP) a very critical one. Due to their dynamism, SDP solutions based on machine learning (ML) methods are envisaged as a viable approach. However, the latent data quality problem is a significant challenge to developing effective SDP models. The class imbalance is a classic example of the data quality problem in which there is a huge differential in the number of class (majority and minority) labels. Findings from studies have shown that data sampling methods are capable of addressing the class imbalance problem. Hence, this study conducts an empirical comparative analysis on the effect of data sampling methods in addressing the class imbalance problem inherent in SDP.
Specifically, the performance of five data sampling (oversampling techniques (SMOTE, ADASYN, and ROS) and undersampling techniques (RUS and NM) methods on four software defect datasets with varying granularities are investigated. As prediction models, decision tree (DT) and random forest (RF) classifiers are deployed as well. Predictive performances of developed models were evaluated using accuracy, the area under the curve (AUC), and Matthews correlation coefficient (MCC) values. Observations from the experimental results showed that the introduction of data sampling methods in SDP processes not only addresses the class imbalance problem but also improves the prediction performances of the experimented classifiers. In addition, models based on ROS resampled datasets had superior predictive performance compared with other studied data sampling-based datasets. In conclusion, it can therefore be recommended to deploy data sampling methods, particularly oversampling methods in SDP processes and other applicable machine learning tasks. {\copyright} 2022, The Author(s), under exclusive license to Springer Nature Switzerland AG.},
}