% Conference paper: TEGRA, table extraction from Web lists (SIGMOD, May 2015).
% The abstract field holds LaTeX markup (emphasis, inline math, escaped percent
% sign) and keeps its three paragraphs separated by blank lines.
% NOTE(review): the url host below looks like a third-party mirror rather than
% the publisher's own site; confirm the canonical address (or add a doi) before
% relying on it.
@inproceedings{chu2015tegra,
  author    = {Chu, Xu and He, Yeye and Chakrabarti, Kaushik and Ganjam, Kris},
  title     = {{TEGRA}: Table Extraction by Global Record Alignment},
  booktitle = {International Conference on Management of Data ({SIGMOD})},
  year      = {2015},
  month     = may,
  abstract  = {It is well known today that pages on the Web contain a large number of content-rich relational tables. Such tables have been systematically extracted in a number of efforts to empower important applications such as table search and schema discovery. However, a significant fraction of relational tables are \emph{not} embedded in the standard HTML table tags, and are thus difficult to extract. In particular, a large number of relational tables are known to be in a ``list'' form, which contains a list of clearly separated rows that are not separated into columns.

In this work, we address the important problem of automatically extracting multi-column relational tables from such lists. Our key intuition lies in the simple observation that in correctly-extracted tables, values in the same column are \emph{coherent}, both at a syntactic and at a semantic level. Using a background corpus of over 100 million tables crawled from the Web, we quantify semantic coherence based on a statistical measure of value co-occurrence in the same column from the corpus. We then model table extraction as a principled optimization problem -- we allocate tokens in each row sequentially to a fixed number of columns, such that the sum of coherence across all pairs of values in the same column is maximized. Borrowing ideas from $A^\star$ search and metric distance, we develop an efficient 2-approximation algorithm. We conduct large-scale table extraction experiments using both real Web data and proprietary enterprise spreadsheet data. Our approach considerably outperforms the state-of-the-art approaches in terms of quality, achieving over 90\% F-measure across many cases.

Our benchmark data has been made available on GitHub https://github.com/Yeye-He/TEGRA-Table-Segmentation to facilitate future research.},
  url       = {http://approjects.co.za/?big=en-us/research/publication/tegra-table-extraction-by-global-record-alignment/},
}