(opens in new tab)<\/span><\/a> to facilitate future research.<\/p>\n<\/div>\n<\/p>\n","protected":false},"excerpt":{"rendered":"
It is well known today that pages on the Web contain a large number of content-rich relational tables. Such tables have been systematically extracted in a number of efforts to empower important applications such as table search and schema discovery. However, a significant fraction of relational tables are \\emph{not} embedded in the standard HTML table […]<\/p>\n","protected":false},"featured_media":0,"template":"","meta":{"msr-url-field":"","msr-podcast-episode":"","msrModifiedDate":"","msrModifiedDateEnabled":false,"ep_exclude_from_search":false,"_classifai_error":"","footnotes":""},"msr-content-type":[3],"msr-research-highlight":[],"research-area":[13563],"msr-publication-type":[193716],"msr-product-type":[],"msr-focus-area":[],"msr-platform":[],"msr-download-source":[],"msr-locale":[268875],"msr-post-option":[],"msr-field-of-study":[],"msr-conference":[],"msr-journal":[],"msr-impact-theme":[],"msr-pillar":[],"class_list":["post-167873","msr-research-item","type-msr-research-item","status-publish","hentry","msr-research-area-data-platform-analytics","msr-locale-en_us"],"msr_publishername":"","msr_edition":"","msr_affiliation":"","msr_published_date":"2015-5-1","msr_host":"","msr_duration":"","msr_version":"","msr_speaker":"","msr_other_contributors":"","msr_booktitle":"","msr_pages_string":"","msr_chapter":"","msr_isbn":"","msr_journal":"","msr_volume":"","msr_number":"","msr_editors":"","msr_series":"","msr_issue":"","msr_organization":"","msr_how_published":"","msr_notes":"","msr_highlight_text":"","msr_release_tracker_id":"","msr_original_fields_of_study":"","msr_download_urls":"","msr_external_url":"","msr_secondary_video_url":"","msr_longbiography":"","msr_microsoftintellectualproperty":1,"msr_main_download":"204374","msr_publicationurl":"","msr_doi":"","msr_publication_uploader":[{"type":"file","viewUrl":"https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2016\/02\/Main.pdf","id":"204374","title":"Main.pdf","label_id":"243109","label":0},{"type":"file","viewUrl":"https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2016\/02\/Webtables_10k_testset_Readme.txt","id":"204376","title":"Webtables_10k_testset_Readme.txt","label_id":"243109","label":0},{"type":"file","viewUrl":"https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2016\/02\/Webtables_10k_testset.txt","id":"204375","title":"Webtables_10k_testset.txt","label_id":"243109","label":0}],"msr_related_uploader":"","msr_attachments":[{"id":204376,"url":"https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2016\/02\/Webtables_10k_testset_Readme.txt"},{"id":204375,"url":"https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2016\/02\/Webtables_10k_testset.txt"},{"id":204374,"url":"https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2016\/02\/Main.pdf"}],"msr-author-ordering":[{"type":"text","value":"Xu Chu","user_id":0,"rest_url":false},{"type":"user_nicename","value":"Yeye He","user_id":34992,"rest_url":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/microsoft-research\/v1\/researchers?person=Yeye He"},{"type":"user_nicename","value":"Kaushik Chakrabarti","user_id":32503,"rest_url":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/microsoft-research\/v1\/researchers?person=Kaushik Chakrabarti"},{"type":"user_nicename","value":"Kris Ganjam","user_id":32579,"rest_url":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/microsoft-research\/v1\/researchers?person=Kris Ganjam"}],"msr_impact_theme":[],"msr_research_lab":[],"msr_event":[],"msr_group":[957177],"msr_project":[169513],"publication":[],"video":[],"download":[],"msr_publication_type":"inproceedings","related_content":{"projects":[{"ID":169513,"post_title":"Data Cleaning","post_name":"data-cleaning","post_type":"msr-project","post_date":"2002-07-01 16:21:12","post_modified":"2017-06-06 10:55:49","post_status":"publish","permalink":"https:\/\/www.microsoft.com\/en-us\/research\/project\/data-cleaning\/","post_excerpt":"Poor data quality is a well-known problem in data warehouses that arises for a variety of reasons such as data entry errors and differences in data representation among data sources. For example, one source may use abbreviated state names while another source may use fully expanded state names. However, high quality data is essential for accurate data analysis. Data cleaning is the process of detecting and correcting errors and inconsistencies in data. Goal Typical data…","_links":{"self":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-project\/169513"}]}}]},"_links":{"self":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-research-item\/167873","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-research-item"}],"about":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/types\/msr-research-item"}],"version-history":[{"count":3,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-research-item\/167873\/revisions"}],"predecessor-version":[{"id":756937,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-research-item\/167873\/revisions\/756937"}],"wp:attachment":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/media?parent=167873"}],"wp:term":[{"taxonomy":"msr-content-type","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-content-type?post=167873"},{"taxonomy":"msr-research-highlight","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-research-highlight?post=167873"},{"taxonomy":"msr-research-area","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/research-area?post=167873"},{"taxonomy":"msr-publication-type","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-publication-type?post=167873"},{"taxonomy":"msr-product-type","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-product-type?post=167873"},{"taxonomy":"msr-focus-area","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-focus-area?post=167873"},{"taxonomy":"msr-platform","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-platform?post=167873"},{"taxonomy":"msr-download-source","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-download-source?post=167873"},{"taxonomy":"msr-locale","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-locale?post=167873"},{"taxonomy":"msr-post-option","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-post-option?post=167873"},{"taxonomy":"msr-field-of-study","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-field-of-study?post=167873"},{"taxonomy":"msr-conference","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-conference?post=167873"},{"taxonomy":"msr-journal","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-journal?post=167873"},{"taxonomy":"msr-impact-theme","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-impact-theme?post=167873"},{"taxonomy":"msr-pillar","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-pillar?post=167873"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}