2020
Sanoja, Andrés; Garcia, Jean
Manual-design of Blocks: Una Herramienta para Gestionar Segmentaciones Manuales de Páginas Web Journal Article
In: ReVeCom, vol. 6, no. 1, pp. 19–27, 2020, ISSN: 2244-7040.
@comment{Journal article (2020) presenting the Manual-design of Blocks (MoB) tool. The abstract was repaired from a mis-encoded export: quotation marks, apostrophe and word spacing restored.}
@article{SanojaGarcia2020,
  title      = {{Manual-design of Blocks}: Una Herramienta para Gestionar Segmentaciones Manuales de Páginas Web},
  author     = {Sanoja, Andrés and Garcia, Jean},
  editor     = {{Sociedad Venezolana de Informática}},
  journal    = {Revista Venezolana de Computación},
  volume     = {6},
  number     = {1},
  pages      = {19--27},
  year       = {2020},
  date       = {2020-03-01},
  issn       = {2244-7040},
  url        = {https://www.researchgate.net/publication/340221556_Manual-design_of_Blocks_Una_Herramienta_para_Gestionar_Segmentaciones_Manuales_de_Paginas_Web_Revista_Venezolana_de_Computacion},
  urldate    = {2020-03-01},
  abstract   = {Web page segmentation is an important task in Web page analysis. The objective is to divide a Web page into blocks, each one representing a coherent part (or segment) of the content. In this work we describe the development of the Manual-design of Blocks (MoB). At the same time we describe how to get a ground truth of segmentations and how to compute the ``best manual segmentation''. The best manual segmentation is defined based on our experience and the data obtained, in this investigation we define one way to obtain it, but we do not consider there's only one way to achieve this. The best segmentation is then available to be used on the evaluation process of segmentation algorithm using the Block-o-Matic framework. Also, a Web API and a Web repository for managing the data. Acceptance test results are presented in this document.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {article}
}
2018
García, Jean Pearre; Sanoja, Andrés
Desarrollo de una Herramienta Interactiva para la Construcción de un “Ground Truth” de Segmentaciones de Páginas Web Technical Report
Universidad Central de Venezuela 2018.
@comment{Technical report (Universidad Central de Venezuela, 2018) on an interactive tool for building a ground truth of Web page segmentations. The citation key is kept exactly as exported, mis-encoded characters included, so that existing citations keep resolving; only the field values were repaired.}
@techreport{GarcÃaSanoja2018c,
  title       = {Desarrollo de una Herramienta Interactiva para la Construcción de un {``Ground Truth''} de Segmentaciones de Páginas Web},
  author      = {García, Jean Pearre and Sanoja, Andrés},
  editor      = {{UCV Computación}},
  institution = {Universidad Central de Venezuela},
  year        = {2018},
  date        = {2018-05-01},
  doi         = {10.13140/RG.2.2.14098.35529},
  url         = {https://www.researchgate.net/publication/325620005_Desarrollo_de_una_Herramienta_Interactiva_para_la_Construccion_de_un_Ground_Truth_de_Segmentaciones_de_Paginas_Web},
  keywords    = {},
  pubstate    = {published},
  tppubtype   = {techreport}
}
2017
Sanoja, Andrés; Gançarski, Stéphane
Migrating Web Archives from HTML4 to HTML5: A Block-Based Approach and Its Evaluation Book Chapter
In: Kirikova, Mārīte; Nørvåg, Kjetil; Papadopoulos, George A (Ed.): Advances in Databases and Information Systems, pp. 375–393, Springer International Publishing, Cham, 2017, ISBN: 978-3-319-66917-5.
@comment{Chapter in an edited Springer volume (2017) on block-based migration of archived pages from HTML4 to HTML5. Editor names were restored from a mis-encoded export and the containing book title was added.}
@inbook{Sanoja2017,
  title      = {Migrating Web Archives from {HTML4} to {HTML5}: A Block-Based Approach and Its Evaluation},
  author     = {Sanoja, Andrés and Gançarski, Stéphane},
  editor     = {Kirikova, Mārīte and Nørvåg, Kjetil and Papadopoulos, George A.},
  booktitle  = {Advances in Databases and Information Systems},
  pages      = {375--393},
  publisher  = {Springer International Publishing},
  address    = {Cham},
  year       = {2017},
  date       = {2017-09-24},
  isbn       = {978-3-319-66917-5},
  doi        = {10.1007/978-3-319-66917-5_25},
  url        = {https://doi.org/10.1007/978-3-319-66917-5_25},
  abstract   = {Web archives (and the Web itself) are likely to suffer from format obsolescence. In a few years or decades, future Web browsers will no more be able to properly render Web pages written in HTML4 format. Thus we propose a migration tool from HTML4 to HTML5. This is challenging, because it requires to generate HTML5 semantic elements that do not exist in HTML4 pages. To solve this issue, we propose to use a Web page segmenter. Indeed, blocks generated by a segmenter are good candidates for being semantic elements as both reflect the content structure of the page. We use an evaluation framework for Web page segmentation, that helps defining and computing relevant metrics to measure the quality of the migration process. We ran experiments on a sample of 40 pages. The migrated pages we produce are compared to a ground truth. The automatic labeling of blocks is quite similar to the ground truth, though its quality depends on the type of page we migrate. When comparing the rendering of the original page and the rendering of its migrated version, we note some differences, mainly due to the fact that rendering engines do not (yet) properly render the content of semantic elements.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {inbook}
}
2016
Sanoja, Andrés; Gançarski, Stéphane
Block-based Migration from HTML4 Standard to HTML5 Standard in the Context of Web Archives Journal Article
In: Revista Venezolana de Computación, vol. 3, no. 1, 2016, ISSN: 2244-7040.
@comment{Journal article (2016) on migrating archived Web pages from HTML4 to HTML5 with the Block-o-Matic segmenter. Word spacing in the abstract was restored where line breaks had been dropped; the wording itself is unchanged.}
@article{ReVeCom11772,
  title      = {Block-based Migration from {HTML4} Standard to {HTML5} Standard in the Context of Web Archives},
  author     = {Sanoja, Andrés and Gançarski, Stéphane},
  journal    = {Revista Venezolana de Computación},
  volume     = {3},
  number     = {1},
  year       = {2016},
  date       = {2016-01-01},
  issn       = {2244-7040},
  url        = {http://saber.ucv.ve/ojs/index.php/rev_vcomp/article/view/11772},
  abstract   = {Web archives are not exempt of format obsolescence. In the near future Web pages written in HTML4 format, could be obsolete. We will have to choose between two preservation strategies: emulation or migration. The first option is the most evident, however due to the size of the Web and the amount of information that Web archives handle it is not practical. In the other hand migration to HTML5 format seems plausible. This is a challenge because we need to modify a page (in HTML4 format) and include elements that not even exists in this format (as the HTML5 semantic elements). Using the Web page segmentation we show that, with the appropriate granularity, blocks look alike these semantic elements. We present the use our segmentation tool, BoM (Block-o-Matic), for helping achieve the migration of Web pages from HTML4 format to HTML5 format in the context of Web archives. We also present an evaluation framework for Web page segmentation, that helps to produce metrics needed to compare the original and migrated version. If both versions are similar the migration has been successful. We show the experiments and results obtained on a sample of 40 pages. We made the manual segmentations for each page using our MoB tool. Results shows that in the migration process there is no data loss but in the migrated version (after adding the semantic elements) the margin is changed. This is, it adds whitespace that change the elements position, shifting elements slightly on the page. While this is imperceptible to the human eye, for systems it is difficult to handle without previous knowledge of this situation.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {article}
}
2015
Sanoja, Andrés; Gançarski, Stéphane
Web page segmentation evaluation Proceedings Article
In: Proceedings of the 30th Annual ACM Symposium on Applied Computing, pp. 753–760, ACM 2015.
@comment{Conference paper (2015) on evaluating Web page segmentation.}
@inproceedings{SanGan:SAC:2015,
  author       = {Andrés Sanoja and Stéphane Gançarski},
  title        = {Web page segmentation evaluation},
  booktitle    = {Proceedings of the 30th Annual ACM Symposium on Applied Computing},
  organization = {ACM},
  pages        = {753--760},
  date         = {2015-03-01},
  year         = {2015},
  tppubtype    = {inproceedings},
  pubstate     = {published},
  keywords     = {}
}
Sanoja, Andrés
Web Page Segmentation, Evaluation and Applications PhD Thesis
Université Pierre et Marie Curie-Paris VI, 2015.
@comment{PhD thesis (2015) on Web page segmentation, its evaluation and applications.}
@phdthesis{Sanoja:LIP6:2015,
  author    = {Andrés Sanoja},
  title     = {Web Page Segmentation, Evaluation and Applications},
  school    = {Université Pierre et Marie Curie-Paris VI},
  address   = {4 place Jussieu, 75005. Paris, France},
  editor    = {UPMC},
  date      = {2015-01-22},
  year      = {2015},
  url       = {https://hal.inria.fr/tel-01128002/},
  tppubtype = {phdthesis},
  pubstate  = {published},
  keywords  = {}
}
2014
Sanoja, Andrés; Gançarski, Stéphane
Block-o-Matic: A web page segmentation framework Proceedings Article
In: International Conference on Multimedia Computing and Systems (ICMCS), pp. 595–600, Marrakesh, Morocco, 2014.
@comment{Conference paper (ICMCS 2014) introducing the Block-o-Matic segmentation framework. The tool name is brace-protected so styles that sentence-case titles keep its capitalisation.}
@inproceedings{Sanoja:ICMCS:2014,
  title      = {{Block-o-Matic}: A web page segmentation framework},
  author     = {Sanoja, Andrés and Gançarski, Stéphane},
  booktitle  = {International Conference on Multimedia Computing and Systems (ICMCS)},
  pages      = {595--600},
  address    = {Marrakesh, Morocco},
  year       = {2014},
  date       = {2014-04-01},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {inproceedings}
}
2013
Sanoja, Andrés; Gançarski, Stéphane
Block-o-Matic: a Web Page Segmentation Tool and its Evaluation Miscellaneous
29ème journées “Base de données avancées”, BDA'13, 2013, (Poster).
@comment{Poster (BDA'13, 2013) presenting Block-o-Matic and its manual counterpart. The venue string was repaired: the grave accent and the opening quotation mark had been flattened by the export.}
@misc{sanoja:hal-00881693,
  title        = {{Block-o-Matic}: a Web Page Segmentation Tool and its Evaluation},
  author       = {Sanoja, Andrés and Gançarski, Stéphane},
  howpublished = {29ème journées ``Base de données avancées'', BDA'13},
  note         = {Poster},
  year         = {2013},
  date         = {2013-01-01},
  url          = {https://hal.archives-ouvertes.fr/hal-00881693},
  abstract     = {In this paper we present our prototype for the web page segmentation called Block-o-matic and its counterpart Block-o-manual, for manual segmentation. The main idea is to evaluate the correctness of the segmentation algorithm. Build a ground truth database for evaluation can take days or months depending on the collection size, however we address our solution with our manual segmentation tool intended to minimize the time of annotation of blocks in web pages. Both tools implements the same rules for segmentation, for the manual version allows to propose blocks to assessor and for the automatic the block selection. We present our demonstration scenario with a collection of web pages organized in categories. After its annotation they are compared with the automatic segmentation version and it is given a score and a visual comparison.},
  keywords     = {},
  pubstate     = {published},
  tppubtype    = {misc}
}
2012
Sanoja, Andrés; Gançarski, Stéphane
Yet Another Hybrid Segmentation Tool Miscellaneous
iPRES 2012 – 9th International Conference on Preservation of Digital Objects, 2012, (Poster).
@comment{Poster (iPRES 2012) giving an overview of a hybrid DOM-plus-rendering segmentation prototype. The abstract was repaired: lost ligatures restored and a footnote marker fused into the first sentence removed.}
@misc{sanoja:hal-00770527,
  title        = {Yet Another Hybrid Segmentation Tool},
  author       = {Sanoja, Andrés and Gançarski, Stéphane},
  howpublished = {iPRES 2012 -- 9th International Conference on Preservation of Digital Objects},
  note         = {Poster},
  year         = {2012},
  date         = {2012-01-01},
  url          = {https://hal.archives-ouvertes.fr/hal-00770527},
  abstract     = {In this paper we present an overview of a prototype we are developing for in the context of web archives (page comparison, crawling and information retrieval). It analyses pages based on their DOM tree information and their visual rendering. This tool implements a modified version of VIPS with the aim of enhancing the precision of visual block extraction and the hierarchy construction. First, the visual rendering of a page, produced by several browsers, is segmented into rectangular blocks. Then, the extracted blocks are analysed looking for visual overlaps, which are analysed using a adapted version of the XY-Cut algorithm and resolve the overlap. As a result we may have different shapes of blocks, rectangular and non-rectangular blocks. Finally, the visual block tree, representing the layout of the page is analysed in order to have a more coherent layout disposition.},
  keywords     = {},
  pubstate     = {published},
  tppubtype    = {misc}
}