{
    "content": [
        {
            "type": "text",
            "text": "# HTML::TableParser (perldoc)\n\n## NAME\n\nHTML::TableParser - HTML::TableParser - Extract data from an HTML table\n\n## SYNOPSIS\n\nuse HTML::TableParser;\n@reqs = (\n{\nid => 1.1,                    # id for embedded table\nhdr => \\&header,              # function callback\nrow => \\&row,                 # function callback\nstart => \\&start,             # function callback\nend => \\&end,                 # function callback\nudata => { Snack => 'Food' }, # arbitrary user data\n},\n{\nid => 1,                      # table id\ncols => [ 'Object Type',\nqr/object/ ],       # column name matches\nobj => $obj,                  # method callbacks\n},\n);\n# create parser object\n$p = HTML::TableParser->new( \\@reqs,\n{ Decode => 1, Trim => 1, Chomp => 1 } );\n$p->parsefile( 'foo.html' );\n# function callbacks\nsub start {\nmy ( $id, $line, $udata ) = @;\n#...\n}\nsub end {\nmy ( $id, $line, $udata ) = @;\n#...\n}\nsub header {\nmy ( $id, $line, $cols, $udata ) = @;\n#...\n}\nsub row  {\nmy ( $id, $line, $cols, $udata ) = @;\n#...\n}\n\n## DESCRIPTION\n\nHTML::TableParser uses HTML::Parser to extract data from an HTML table. The data is returned via\na series of user defined callback functions or methods. Specific tables may be selected either\nby a matching a unique table id or by matching against the column names. Multiple (even nested)\ntables may be parsed in a document in one pass.\n\n## Sections\n\n- **NAME**\n- **VERSION**\n- **SYNOPSIS**\n- **DESCRIPTION** (5 subsections)\n- **METHODS**\n- **Table Requests** (4 subsections)\n- **BUGS**\n- **SOURCE**\n- **AUTHOR**\n- **COPYRIGHT AND LICENSE**\n\nUse structuredContent.sections for detailed options, examples, and full documentation.\n"
        }
    ],
    "structuredContent": {
        "command": "HTML::TableParser",
        "section": "",
        "mode": "perldoc",
        "summary": "HTML::TableParser - HTML::TableParser - Extract data from an HTML table",
        "synopsis": "use HTML::TableParser;\n@reqs = (\n{\nid => 1.1,                    # id for embedded table\nhdr => \\&header,              # function callback\nrow => \\&row,                 # function callback\nstart => \\&start,             # function callback\nend => \\&end,                 # function callback\nudata => { Snack => 'Food' }, # arbitrary user data\n},\n{\nid => 1,                      # table id\ncols => [ 'Object Type',\nqr/object/ ],       # column name matches\nobj => $obj,                  # method callbacks\n},\n);\n# create parser object\n$p = HTML::TableParser->new( \\@reqs,\n{ Decode => 1, Trim => 1, Chomp => 1 } );\n$p->parsefile( 'foo.html' );\n# function callbacks\nsub start {\nmy ( $id, $line, $udata ) = @;\n#...\n}\nsub end {\nmy ( $id, $line, $udata ) = @;\n#...\n}\nsub header {\nmy ( $id, $line, $cols, $udata ) = @;\n#...\n}\nsub row  {\nmy ( $id, $line, $cols, $udata ) = @;\n#...\n}",
        "tldr_summary": null,
        "tldr_examples": [],
        "tldr_source": null,
        "flags": [],
        "examples": [],
        "see_also": [],
        "section_outline": [
            {
                "name": "NAME",
                "lines": 2,
                "subsections": []
            },
            {
                "name": "VERSION",
                "lines": 2,
                "subsections": []
            },
            {
                "name": "SYNOPSIS",
                "lines": 46,
                "subsections": []
            },
            {
                "name": "DESCRIPTION",
                "lines": 5,
                "subsections": [
                    {
                        "name": "Table Identification",
                        "lines": 5
                    },
                    {
                        "name": "Data Extraction",
                        "lines": 24
                    },
                    {
                        "name": "Callback API",
                        "lines": 17
                    },
                    {
                        "name": "Data Cleanup",
                        "lines": 13
                    },
                    {
                        "name": "Data Organization",
                        "lines": 27
                    }
                ]
            },
            {
                "name": "METHODS",
                "lines": 23,
                "subsections": []
            },
            {
                "name": "Table Requests",
                "lines": 13,
                "subsections": [
                    {
                        "name": "Identifying tables to parse",
                        "lines": 109
                    },
                    {
                        "name": "Specifying the data callbacks",
                        "lines": 41
                    },
                    {
                        "name": "Specifying Data cleanup operations",
                        "lines": 4
                    },
                    {
                        "name": "Other Attributes",
                        "lines": 4
                    }
                ]
            },
            {
                "name": "BUGS",
                "lines": 7,
                "subsections": []
            },
            {
                "name": "SOURCE",
                "lines": 3,
                "subsections": []
            },
            {
                "name": "AUTHOR",
                "lines": 2,
                "subsections": []
            },
            {
                "name": "COPYRIGHT AND LICENSE",
                "lines": 6,
                "subsections": []
            }
        ],
        "sections": {
            "NAME": {
                "content": "HTML::TableParser - HTML::TableParser - Extract data from an HTML table\n",
                "subsections": []
            },
            "VERSION": {
                "content": "version 0.43\n",
                "subsections": []
            },
            "SYNOPSIS": {
                "content": "use HTML::TableParser;\n\n@reqs = (\n{\nid => 1.1,                    # id for embedded table\nhdr => \\&header,              # function callback\nrow => \\&row,                 # function callback\nstart => \\&start,             # function callback\nend => \\&end,                 # function callback\nudata => { Snack => 'Food' }, # arbitrary user data\n},\n{\nid => 1,                      # table id\ncols => [ 'Object Type',\nqr/object/ ],       # column name matches\nobj => $obj,                  # method callbacks\n},\n);\n\n# create parser object\n$p = HTML::TableParser->new( \\@reqs,\n{ Decode => 1, Trim => 1, Chomp => 1 } );\n$p->parsefile( 'foo.html' );\n\n\n# function callbacks\nsub start {\nmy ( $id, $line, $udata ) = @;\n#...\n}\n\nsub end {\nmy ( $id, $line, $udata ) = @;\n#...\n}\n\nsub header {\nmy ( $id, $line, $cols, $udata ) = @;\n#...\n}\n\nsub row  {\nmy ( $id, $line, $cols, $udata ) = @;\n#...\n}\n",
                "subsections": []
            },
            "DESCRIPTION": {
                "content": "HTML::TableParser uses HTML::Parser to extract data from an HTML table. The data is returned via\na series of user defined callback functions or methods. Specific tables may be selected either\nby a matching a unique table id or by matching against the column names. Multiple (even nested)\ntables may be parsed in a document in one pass.\n",
                "subsections": [
                    {
                        "name": "Table Identification",
                        "content": "Each table is given a unique id, relative to its parent, based upon its order and nesting. The\nfirst top level table has id 1, the second 2, etc. The first table nested in table 1 has id 1.1,\nthe second 1.2, etc. The first table nested in table 1.1 has id 1.1.1, etc. These, as well as\nthe tables' column names, may be used to identify which tables to parse.\n"
                    },
                    {
                        "name": "Data Extraction",
                        "content": "As the parser traverses a selected table, it will pass data to user provided callback functions\nor methods after it has digested particular structures in the table. All functions are passed\nthe table id (as described above), the line number in the HTML source where the table was found,\nand a reference to any table specific user provided data.\n\nTable Start\nThe start callback is invoked when a matched table has been found.\n\nTable End\nThe end callback is invoked after a matched table has been parsed.\n\nHeader  The hdr callback is invoked after the table header has been read in. Some tables do not\nuse the <th> tag to indicate a header, so this function may not be called. It is passed\nthe column names.\n\nRow     The row callback is invoked after a row in the table has been read. It is passed the\ncolumn data.\n\nWarn    The warn callback is invoked when a non-fatal error occurs during parsing. Fatal errors\ncroak.\n\nNew     This is the class method to call to create a new object when HTML::TableParser is\nsupposed to create new objects upon table start.\n"
                    },
                    {
                        "name": "Callback API",
                        "content": "Callbacks may be functions or methods or a mixture of both. In the latter case, an object must\nbe passed to the constructor. (More on that later.)\n\nThe callbacks are invoked as follows:\n\nstart( $tblid, $lineno, $udata );\n\nend( $tblid, $lineno, $udata );\n\nhdr( $tblid, $lineno, \\@colnames, $udata );\n\nrow( $tblid, $lineno, \\@data, $udata );\n\nwarn( $tblid, $lineno, $message, $udata );\n\nnew( $tblid, $udata );\n"
                    },
                    {
                        "name": "Data Cleanup",
                        "content": "There are several cleanup operations that may be performed automatically:\n\nChomp   chomp() the data\n\nDecode  Run the data through HTML::Entities::decode.\n\nDecodeNBSP\nNormally HTML::Entitites::decode changes a non-breaking space into a character which\ndoesn't seem to be matched by Perl's whitespace regexp. Setting this attribute changes\nthe HTML \"nbsp\" character to a plain 'ol blank.\n\nTrim    remove leading and trailing white space.\n"
                    },
                    {
                        "name": "Data Organization",
                        "content": "Column names are derived from cells delimited by the <th> and </th> tags. Some tables have\nheader cells which span one or more columns or rows to make things look nice. HTML::TableParser\ndetermines the actual number of columns used and provides column names for each column,\nrepeating names for spanned columns and concatenating spanned rows and columns. For example, if\nthe table header looks like this:\n\n+----+--------+----------+-------------+-------------------+\n|    |        | Eq J2000 |             | Velocity/Redshift |\n| No | Object |----------| Object Type |-------------------|\n|    |        | RA | Dec |             | km/s |  z  | Qual |\n+----+--------+----------+-------------+-------------------+\n\nThe columns will be:\n\nNo\nObject\nEq J2000 RA\nEq J2000 Dec\nObject Type\nVelocity/Redshift km/s\nVelocity/Redshift z\nVelocity/Redshift Qual\n\nRow data are derived from cells delimited by the <td> and </td> tags. Cells which span more than\none column or row are handled correctly, i.e. the values are duplicated in the appropriate\nplaces.\n"
                    }
                ]
            },
            "METHODS": {
                "content": "new\n$p = HTML::TableParser->new( \\@reqs, \\%attr );\n\nThis is the class constructor. It is passed a list of table requests as well as\nattributes which specify defaults for common operations. Table requests are documented\nin \"Table Requests\".\n\nThe %attr hash provides default values for some of the table request attributes, namely\nthe data cleanup operations ( \"Chomp\", \"Decode\", \"Trim\" ), and the multi match attribute\n\"MultiMatch\", i.e.,\n\n$p = HTML::TableParser->new( \\@reqs, { Chomp => 1 } );\n\nwill set Chomp on for all of the table requests, unless overridden by them. The data\ncleanup operations are documented above; \"MultiMatch\" is documented in \"Table Requests\".\n\nDecode defaults to on; all of the others default to off.\n\nparsefile\nThis is the same function as in HTML::Parser.\n\nparse   This is the same function as in HTML::Parser.\n",
                "subsections": []
            },
            "Table Requests": {
                "content": "A table request is a hash used by HTML::TableParser to determine which tables are to be parsed,\nthe callbacks to be invoked, and any data cleanup. There may be multiple requests processed by\none call to the parser; each table is associated with a single request (even if several requests\nmatch the table).\n\nA single request may match several tables, however unless the MultiMatch attribute is specified\nfor that request, it will be used for the first matching table only.\n\nA table request which matches a table id of \"DEFAULT\" will be used as a catch-all request, and\nwill match all tables not matched by other requests. Please note that tables are compared to the\nrequests in the order that the latter are passed to the new() method; place the DEFAULT method\nlast for proper behavior.\n",
                "subsections": [
                    {
                        "name": "Identifying tables to parse",
                        "content": "HTML::TableParser needs to be told which tables to parse. This can be done by matching table ids\nor column names, or a combination of both. The table request hash elements dedicated to this\nare:\n\nid      This indicates a match on table id. It can take one of these forms:\n\nexact match\nid => $match\nid => '1.2'\n\nHere $match is a scalar which is compared directly to the table id.\n\nregular expression\nid => $re\nid => qr/1\\.\\d+\\.2/\n\n$re is a regular expression, which must be constructed with the \"qr//\" operator.\n\nsubroutine\nid => \\&mymatchsubroutine\nid => sub { my ( $id, $oids ) = @ ;\n$oids[0] > 3 && $oids[1] < 2 }\n\nHere \"id\" is assigned a coderef to a subroutine which returns true if the table\nmatches, false if not. The subroutine is passed two arguments: the table id as a\nscalar string ( e.g. 1.2.3) and the table id as an arrayref (e.g. \"$oids = [ 1,\n2, 3]\").\n\n\"id\" may be passed an array containing any combination of the above:\n\nid => [ '1.2', qr/1\\.\\d+\\.2/, sub { ... } ]\n\nElements in the array may be preceded by a modifier indicating the action to be taken if\nthe table matches on that element. The modifiers and their meanings are:\n\n\"-\"     If the id matches, it is explicitly excluded from being processed by this\nrequest.\n\n\"--\"    If the id matches, it is skipped by all requests.\n\n\"+\"     If the id matches, it will be processed by this request. This is the default\naction.\n\nAn example:\n\nid => [ '-', '1.2', 'DEFAULT' ]\n\nindicates that this request should be used for all tables, except for table 1.2.\n\nid => [ '--', '1.2' ]\n\nTable 2 is just plain skipped altogether.\n\ncols    This indicates a match on column names. It can take one of these forms:\n\nexact match\ncols => $match\ncols => 'Snacks01'\n\nHere $match is a scalar which is compared directly to the column names. If any\ncolumn matches, the table is processed.\n\nregular expression\ncols => $re\ncols => qr/Snacks\\d+/\n\n$re is a regular expression, which must be constructed with the \"qr//\" operator.\nAgain, a successful match against any column name causes the table to be\nprocessed.\n\nsubroutine\ncols => \\&mymatchsubroutine\ncols => sub { my ( $id, $oids, $cols ) = @ ;\n... }\n\nHere \"cols\" is assigned a coderef to a subroutine which returns true if the\ntable matches, false if not. The subroutine is passed three arguments: the table\nid as a scalar string ( e.g. 1.2.3), the table id as an arrayref (e.g. \"$oids =\n[ 1, 2, 3]\"), and the column names, as an arrayref (e.g. \"$cols = [ 'col1',\n'col2' ]\"). This option gives the calling routine the ability to make arbitrary\nselections based upon table id and columns.\n\n\"cols\" may be passed an arrayref containing any combination of the above:\n\ncols => [ 'Snacks01', qr/Snacks\\d+/, sub { ... } ]\n\nElements in the array may be preceded by a modifier indicating the action to be taken if\nthe table matches on that element. They are the same as the table id modifiers mentioned\nabove.\n\ncolre   This is deprecated, and is present for backwards compatibility only. An arrayref\ncontaining the regular expressions to match, or a scalar containing a single reqular\nexpression\n\nMore than one of these may be used for a single table request. A request may match more than one\ntable. By default a request is used only once (even the \"DEFAULT\" id match!). Set the\n\"MultiMatch\" attribute to enable multiple matches per request.\n\nWhen attempting to match a table, the following steps are taken:\n\n1       The table id is compared to the requests which contain an id match. The first such match\nis used (in the order given in the passed array).\n\n2       If no explicit id match is found, column name matches are attempted. The first such\nmatch is used (in the order given in the passed array)\n\n3       If no column name match is found (or there were none requested), the first request which\nmatches an id of \"DEFAULT\" is used.\n"
                    },
                    {
                        "name": "Specifying the data callbacks",
                        "content": "Callback functions are specified with the callback attributes \"start\", \"end\", \"hdr\", \"row\", and\n\"warn\". They should be set to code references, i.e.\n\n%tablereq = ( ..., start => \\&startfunc, end => \\&endfunc )\n\nTo use methods, specify the object with the \"obj\" key, and the method names via the callback\nattributes, which should be set to strings. If you don't specify method names they will default\nto (you guessed it) \"start\", \"end\", \"hdr\", \"row\", and \"warn\".\n\n$obj = SomeClass->new();\n# ...\n%tablereq1 = ( ..., obj => $obj );\n%tablereq2 = ( ..., obj => $obj, start => 'start',\nend => 'end' );\n\nYou can also have HTML::TableParser create a new object for you for each table by specifying the\n\"class\" attribute. By default the constructor is assumed to be the class new() method; if not,\nspecify it using the \"new\" attribute:\n\nuse MyClass;\n%tablereq = ( ..., class => 'MyClass', new => 'mynew' );\n\nTo use a function instead of a method for a particular callback, set the callback attribute to a\ncode reference:\n\n%tablereq = ( ..., obj => $obj, end => \\&endfunc );\n\nYou don't have to provide all the callbacks. You should not use both \"obj\" and \"class\" in the\nsame table request.\n\nHTML::TableParser automatically determines if your object or class has one of the required\nmethods. If you wish it *not* to use a particular method, set it equal to \"undef\". For example\n\n%tablereq = ( ..., obj => $obj, end => undef )\n\nindicates the object's end method should not be called, even if it exists.\n\nYou can specify arbitrary data to be passed to the callback functions via the \"udata\" attribute:\n\n%tablereq = ( ..., udata => \\%hashofmyspecialstuff )\n"
                    },
                    {
                        "name": "Specifying Data cleanup operations",
                        "content": "Data cleanup operations may be specified uniquely for each table. The available keys are\n\"Chomp\", \"Decode\", \"Trim\". They should be set to a non-zero value if the operation is to be\nperformed.\n"
                    },
                    {
                        "name": "Other Attributes",
                        "content": "The \"MultiMatch\" key is used when a request is capable of handling multiple tables in the\ndocument. Ordinarily, a request will process a single table only (even \"DEFAULT\" requests). Set\nit to a non-zero value to allow the request to handle more than one table.\n"
                    }
                ]
            },
            "BUGS": {
                "content": "Please report any bugs or feature requests on the bugtracker website\n<https://rt.cpan.org/Public/Dist/Display.html?Name=HTML-TableParser> or by email to\nbug-HTML-TableParser@rt.cpan.org <mailto:bug-HTML-TableParser@rt.cpan.org>.\n\nWhen submitting a bug or request, please include a test-file or a patch to an existing test-file\nthat illustrates the bug or desired feature.\n",
                "subsections": []
            },
            "SOURCE": {
                "content": "The development version is on github at <https://github.com/djerius/html-tableparser> and may be\ncloned from <git://github.com/djerius/html-tableparser.git>\n",
                "subsections": []
            },
            "AUTHOR": {
                "content": "Diab Jerius <djerius@cpan.org>\n",
                "subsections": []
            },
            "COPYRIGHT AND LICENSE": {
                "content": "This software is Copyright (c) 2018 by Smithsonian Astrophysical Observatory.\n\nThis is free software, licensed under:\n\nThe GNU General Public License, Version 3, June 2007\n",
                "subsections": []
            }
        }
    }
}