% Conference paper (CIE 2009). Title and abstract had words fused together where
% line breaks were lost during export; the missing spaces are restored here.
% The "series" field was dropped because it only repeated "booktitle" verbatim,
% which makes standard styles print the proceedings name twice.
@inproceedings{28a8b935476f440293c083f58f824826,
title = "Semi-supervised text classification from unlabeled documents using class associated words",
abstract = "Automatically classifying text documents is an important field in machine learning. Unsupervised text classification does not need training data but is often criticized to cluster blindly. Supervised text classification needs large quantities of labeled training data to achieve high accuracy. However, in practice, labeled samples are often difficult, expensive or time consuming to obtain. In the meanwhile, unlabeled documents can be collected easily owing to the rapid developing Internet. Class associated words are the words which represent the subject of classes and provide prior knowledge of classification for training a classifier. A learning algorithm, based on the combination of Expectation-Maximization (EM) and a Na{\"i}ve Bayes classifier, is introduced to classify documents from fully unlabeled documents using class associated words. Experimental results show that it has good classification capability with high accuracy, especially for those categories with small quantities of samples. In the algorithm, class associated words are used to set classification constraints during learning process to restrict to classify documents into corresponding class labels and improve the classification accuracy.",
keywords = "Class associated words, Expectation-maximization, Na{\"i}ve bayes, Semi-supervised, Text classification",
author = "Han, {Hong Qi} and Zhu, {Dong Hua} and Wang, {Xue Feng}",
year = "2009",
doi = "10.1109/iccie.2009.5223918",
language = "English",
isbn = "9781424441365",
publisher = "IEEE Computer Society",
pages = "1255--1260",
booktitle = "2009 International Conference on Computers and Industrial Engineering, CIE 2009",
address = "United States",
note = "2009 International Conference on Computers and Industrial Engineering, CIE 2009 ; Conference date: 06-07-2009 Through 09-07-2009",
}