% Journal article: J. Information Technology Review 17(2), 2026.
% The numeric citation key is kept as-is so existing citations keep resolving.
% The DOI is stored bare (no resolver prefix); styles add the link themselves.
% NOTE(review): pages 65--79 are taken from the DOI suffix; confirm against the published PDF.
@article{4717,
  author   = {Pichappan, Pit},
  title    = {Misinformation Clusters and Virality Dynamics in Online Social Networks: A Topic Modeling and Epidemiological Analysis of {Reddit} Communities},
  journal  = {Journal of Information Technology Review},
  year     = {2026},
  volume   = {17},
  number   = {2},
  pages    = {65--79},
  doi      = {10.6025/jitr/2026/17/2/65-79},
  url      = {https://www.dline.info/jitr/fulltext/v17n2/jitrv17n2_2.pdf},
  abstract = {The proliferation of misinformation on social media poses significant societal risks, necessitating robust analytical frameworks. This study presents a comprehensive analysis of misinformation clusters and virality dynamics within Reddit communities using topic modeling and epidemiological diffusion models. Utilizing a longitudinal dataset sourced from the Zenodo repository spanning 2012 to 2024, we employed TF-IDF vectorization and K-means clustering to identify five dominant thematic groups, including Health, US Politics, and General Discussions. Misinformation detection was enhanced with a fine-tuned DistilBERT model, achieving 94--96\% accuracy and surpassing traditional logistic regression baselines. Results indicate that while General Discussions contain the highest absolute volume of misinformation, US Politics exhibits the highest density, creating critical risk zones. Virality metrics reveal a credibility paradox: misleading content achieves higher upvote ratios than factual news, suggesting stronger audience resonance despite lower overall reach. Furthermore, an adapted SEIR (Susceptible-Exposed-Infected-Recovered) framework confirms that information spread follows epidemic-like patterns, with political topics acting as infectious nodes and general topics providing susceptible populations until saturation. Explainable AI techniques, specifically LIME, validated model interpretability by highlighting linguistic markers such as ``fake'' and ``hiding.'' These findings underscore that misinformation is not randomly distributed but concentrated in specific thematic clusters where high virality overlaps with high density. Strategic monitoring prioritizing these high-risk clusters is essential for mitigating information epidemics and developing effective countermeasures against the evolving landscape of online disinformation. Future research should incorporate temporal dynamics into the model to predict viral spikes before saturation occurs.},
}