@comment{Entry 4769: article in the Journal of Information Technology Review, vol. 17, no. 3 (2026).
  Review notes:
  - The numeric citation key is kept as-is so that existing citations keep resolving.
  - The author field originally read "Puji Lestari, M. Kom". The trailing "M. Kom" appears to be
    an academic degree rather than a given name, so it was dropped; treating "Lestari" as the
    surname is an assumption. TODO: confirm against the published PDF.
  - The pages value is taken from the page range embedded in the DOI suffix. TODO: confirm.
  - The abstract starts mid-sentence ("and disease pathogenesis.") in the source record; the
    missing opening text should be restored from the published PDF.}
@article{4769,
  author   = {Lestari, Puji},
  title    = {Explainable Machine Learning for {DNA} Methylation Prediction: A Comprehensive Interpretability and Biological Analysis},
  journal  = {Journal of Information Technology Review},
  year     = {2026},
  volume   = {17},
  number   = {3},
  pages    = {148--172},
  doi      = {10.6025/jitr/2026/17/3/148-172},
  url      = {https://www.dline.info/jitr/fulltext/v17n3/jitrv17n3_3.pdf},
  abstract = {and disease pathogenesis. Although machine learning (ML) has shown promise in
    predicting methylation status from genomic features, many predictive models operate as
    ``black boxes,'' limiting biological interpretability and clinical translation.
    Objective: This study develops an explainable machine learning framework for DNA
    methylation prediction that integrates predictive modeling with multiple interpretability
    techniques to identify influential genomic determinants and elucidate their biological
    relevance.
    Methods: A supervised classification model was trained on a DNA methylation dataset
    comprising four explanatory variables---CpG density, genomic location, regulatory score,
    and conservation score---with binary methylation status as the target. Global
    interpretability was achieved using SHAP (SHapley Additive exPlanations) feature
    importance and summary plots. Local explanations were generated using SHAP waterfall and
    force plots alongside LIME (Local Interpretable Model-agnostic Explanations). Functional
    relationships were examined through Partial Dependence Plots (PDP) and Individual
    Conditional Expectation (ICE) plots. Robustness was assessed via bootstrap resampling
    (1,000 iterations), five-fold cross-validation, and correlation analysis between SHAP and
    LIME explanations.
    Results: CpG density emerged as the most influential predictor (mean absolute SHAP = 0.36),
    followed by regulatory score (0.24), conservation score (0.13), and genomic location
    (0.05). SHAP dependence plots revealed nonlinear threshold effects, with CpG density
    values below 0.3 producing negative contributions and values above 0.7 strongly
    increasing methylation probability. PDP analyses confirmed saturation behavior at high
    feature values. Local explanations successfully decomposed individual predictions,
    demonstrating that high-confidence methylation calls (e.g., P = 0.92) were driven
    primarily by CpG density and regulatory score. Robustness analyses showed strong
    agreement between SHAP and LIME (Pearson r = 0.97) and low cross-validation variability
    (coefficient of variation $<$10\% for top features).
    Conclusion: Explainable machine learning provides biologically interpretable insights
    into DNA methylation determinants while maintaining predictive capability. The consistent
    identification of CpG density, regulatory activity, and evolutionary conservation as
    dominant predictors aligns with established epigenetic principles. This framework offers
    a transparent, reproducible approach for epigenetic biomarker discovery and supports
    precision medicine applications. Future work should extend these methods to multi-omic
    datasets and experimental validation.},
}