% Journal article record (key 1433).
% Authors are stored as "Last, First" and joined with " and ", the only
% name separator BibTeX recognises; a comma-separated list is parsed as a
% single malformed name.
% NOTE(review): the source record carried an empty doi field (dropped here)
% and no pages field -- add both if they can be confirmed from the publisher.
@article{1433,
  author   = {Benghabrit, Asmaa and Ouhbi, Brahim and Behja, Hicham and Frikh, Bouchra},
  title    = {Statistical and Semantic Feature Selection for Text Clustering},
  journal  = {Journal of Intelligent Computing},
  year     = {2013},
  volume   = {4},
  number   = {2},
  url      = {http://www.dline.info/jic/fulltext/v4n2/3.pdf},
  abstract = {Organizing textual documents by categorizing them is important and beneficial for information retrieval; but when it comes to clustering documents containing a huge number of terms, the task become challenged. Therefore, selecting effective features is essential for reducing the feature space dimensionality and improving the clustering performances. While numerous methods have been developed for this purpose, fewer techniques considered the semantic knowledge that can be incorporate into the clustering process. This paper proposes first a new semantic feature selection method SIM based on the mutual information metric, and second a novel two phase clustering mechanism. The statistical feature selection method CHIR integrates into the frequency clustering stage and then our technique SIM is used in the second stage to pilot the semantic categorization. The content based analysis allows enhancing the frequency clustering by taking the semantic relationships between the features into account. The successful evaluation of our approach demonstrates its relevancy in catching statistical and semantic pertinent features that enable better clustering accuracy in terms of F-measure and purity.},
}