% Journal article. The DOI is stored bare (no resolver prefix) so styles can build the link.
% NOTE(review): the page range is inferred from the DOI suffix (year/volume/issue/pages); confirm against the published PDF.
@article{4767,
  author   = {Pichappan, Pit},
  title    = {Mapping the Contemporary {LLM} Landscape: A Descriptive Analysis of Benchmark Performance and Capability Stratification},
  journal  = {Journal of Information Technology Review},
  year     = {2026},
  volume   = {17},
  number   = {3},
  pages    = {105--119},
  doi      = {10.6025/jitr/2026/17/3/105-119},
  url      = {https://www.dline.info/jitr/fulltext/v17n3/jitrv17n3_1.pdf},
  abstract = {The rapid proliferation of Large Language Models (LLMs) has established benchmark evaluations as the
primary mechanism for assessing model capability and technological progress. However, growing concerns
regarding benchmark validity, data contamination, and the interpretability of aggregate scores highlight a
critical gap in understanding how these metrics reflect the broader LLM ecosystem. This study addresses
this gap by conducting a comprehensive descriptive analysis of benchmark performance and capability
stratification across contemporary LLMs. Utilizing the Comprehensive LLM Benchmark Dataset, comprising
390 model-benchmark observations from 2022 to 2024, we employ descriptive statistics, density estimation,
and performance-tier categorization to map the performance landscape. Our findings reveal a negatively
skewed distribution with a high median but substantial variability, indicating that while baseline
competencies are standardizing, significant capability gaps persist. Furthermore, the analysis identifies
distinct capability strata, with Strong and Top Tier models accounting for over 56 per cent of observations,
yet a substantial proportion of Weak and Moderate performers remain. These results demonstrate that the
contemporary LLM landscape is highly stratified rather than homogeneous. This stratification highlights
the need for delicate evaluation. Ultimately, this research underscores that aggregate benchmark scores
often obscure underlying heterogeneity in capabilities. We conclude that future evaluation frameworks
must evolve toward multidimensional, capability-oriented methodologies to accurately capture model
maturity and real-world utility, providing a foundational baseline for subsequent research on scaling laws
and architectural effectiveness.},
}