% Conference paper: "Extraction of informative blocks from web pages" (ALPIT 2008).
% The citation key is the exporter-generated identifier and is kept unchanged so
% existing \cite commands continue to resolve.
% No `series` field is given: the proceedings title belongs in `booktitle` only;
% repeating it in `series` makes some styles print the title twice.
% Given names are braced (e.g. {Yu Juan}) so that they are handled as one unit.
@inproceedings{032eaa6950f44149a2e8059dca1ce815,
  author    = {Cao, {Yu Juan} and Niu, {Zhen Dong} and Dai, {Liu Ling} and Zhao, {Yu Ming}},
  title     = {Extraction of informative blocks from web pages},
  booktitle = {Proceedings - {ALPIT} 2008, 7th International Conference on Advanced Language Processing and Web Information Technology},
  year      = {2008},
  pages     = {544--549},
  doi       = {10.1109/ALPIT.2008.106},
  isbn      = {9780769532738},
  language  = {English},
  keywords  = {Data mining, Information extraction, LSI, SVM, VIPS, Web, Web page segmentation},
  abstract  = {Typically Web pages always contain a large amount of banner ads, navigation bars, and copyright notices etc. Such irrelevant information is not part of the main contents of the pages, they will seriously harm Web mining and searching. In this paper, we develop and evaluate a method that utilizes both the visual features and the semantic information to extract informative blocks. We first partition a web page into semantic blocks using vision-based page segmentation. The visual and the semantic information got by LSI (Latent Semantic Indexing) are extracted to form the feature-vector of the block. Second we manually annotate informative or uninformative labels to the blocks. The labeled blocks are used as training dataset to train a classification model. Then the informative blocks can be extracted through the model. Our experiments show that the proposed EIBA (Extract Informative Block Arithmetic) is able to dramatically improve the results in near-duplicate detection and classification tasks.},
  note      = {ALPIT 2008, 7th International Conference on Advanced Language Processing and Web Information Technology ; Conference date: 23-07-2008 Through 25-07-2008},
}