Perl
Perl
HTML Table to CSV
See more HTML-to-XML/Text Examples
Demonstrates a method for converting an HTML table to a CSV file.Note: This example requires Chilkat v9.5.0.77 or greater.
Chilkat Perl Downloads
use chilkat();
$success = 0;
# This example requires the Chilkat API to have been previously unlocked.
# See Global Unlock Sample for sample code.
# First download the HTML containing the table
$http = chilkat::CkHttp->new();
$bdHtml = chilkat::CkBinData->new();
$success = $http->QuickGetBd("https://example-code.com/data/etf_table.html",$bdHtml);
if ($success != 1) {
print $http->lastErrorText() . "\r\n";
exit;
}
# Convert to XML.
$htx = chilkat::CkHtmlToXml->new();
$htx->SetHtmlBd($bdHtml);
$sbXml = chilkat::CkStringBuilder->new();
$htx->ToXmlSb($sbXml);
$xml = chilkat::CkXml->new();
$xml->LoadSb($sbXml,1);
# Remove attributes and sub-trees we don't need.
# (In other words, we're getting rid of clutter...)
$numRemoved = $xml->PruneTag("thead");
$numRemoved = $xml->PruneAttribute("style");
$numRemoved = $xml->PruneAttribute("class");
# Scrub the element and attribute content.
$xml->Scrub("ContentTrimEnds,ContentTrimInside,AttrTrimEnds,AttrTrimInside");
# Let's see what we have...
print $xml->getXml() . "\r\n";
# We have the following XML.
# Copy this XML into the online tool at Generate Parsing Code from XML
# as a starting point for accessing the data..
# <?xml version="1.0" encoding="utf-8"?>
# <root>
# <html>
# <head>
# <meta http-equiv="content-type" content="text/html; charset=UTF-8"/>
# </head>
# <body text="#000000" bgcolor="#FFFFFF">
# <div>
# <div>
# <table role="grid" data-scrollx="true" data-sortdirection="desc" data-sorton="-1"/>
# </div>
# </div>
# <div>
# <table id="topHoldingsTable" role="grid" data-scrollx="true" data-sortdirection="desc" data-sorton="-1">
# <tbody>
# <tr role="row">
# <td>
# <text>ITUB4</text>
# </td>
# <td>
# <text>ITAU UNIBANCO HOLDING PREF SA</text>
# </td>
# <td>
# <text>Financials</text>
# </td>
# <td>
# <text>Brazil</text>
# </td>
# <td>
# <text>10.94</text>
# </td>
# <td>
# <text>998,954,813.73</text>
# </td>
# </tr>
# <tr role="row">
# <td>
# <text>BBDC4</text>
# </td>
# <td>
# <text>BANCO BRADESCO PREF SA</text>
# </td>
# <td>
# <text>Financials</text>
# </td>
# <td>
# <text>Brazil</text>
# </td>
# <td>
# <text>9.01</text>
# </td>
# <td>
# <text>822,164,622.75</text>
# </td>
# </tr>
# ...
# ...
# ...
# </tbody>
# </table>
# </div>
# </body>
# </html>
# </root>
#
# This is the code generated by the online tool:
#
$i = 0;
$count_i = $xml->NumChildrenHavingTag("html|body|div");
while ($i < $count_i) {
$xml->put_I($i);
$table_role = $xml->chilkatPath("html|body|div[i]|div|table|(role)");
$table_data_scrollx = $xml->chilkatPath("html|body|div[i]|div|table|(data-scrollx)");
$table_data_sortdirection = $xml->chilkatPath("html|body|div[i]|div|table|(data-sortdirection)");
$table_data_sorton = $xml->chilkatPath("html|body|div[i]|div|table|(data-sorton)");
$table_id = $xml->chilkatPath("html|body|div[i]|table|(id)");
$table_role = $xml->chilkatPath("html|body|div[i]|table|(role)");
$table_data_scrollx = $xml->chilkatPath("html|body|div[i]|table|(data-scrollx)");
$table_data_sortdirection = $xml->chilkatPath("html|body|div[i]|table|(data-sortdirection)");
$table_data_sorton = $xml->chilkatPath("html|body|div[i]|table|(data-sorton)");
$j = 0;
$count_j = $xml->NumChildrenHavingTag("html|body|div[i]|table|tbody|tr");
while ($j < $count_j) {
$xml->put_J($j);
$tr_role = $xml->chilkatPath("html|body|div[i]|table|tbody|tr[j]|(role)");
$k = 0;
$count_k = $xml->NumChildrenHavingTag("html|body|div[i]|table|tbody|tr[j]|td");
while ($k < $count_k) {
$xml->put_K($k);
$text = $xml->getChildContent("html|body|div[i]|table|tbody|tr[j]|td[k]|text");
$k = $k + 1;
}
$j = $j + 1;
}
$i = $i + 1;
}
# Let's modify the above code to build the CSV.
$csv = chilkat::CkCsv->new();
$csv->SetColumnName(0,"Ticker");
$csv->SetColumnName(1,"Name");
$csv->SetColumnName(2,"Sector");
$csv->SetColumnName(3,"Country");
$csv->SetColumnName(4,"Weight");
$csv->SetColumnName(5,"Notional Vaue");
$i = 0;
$count_i = $xml->NumChildrenHavingTag("html|body|div");
while ($i < $count_i) {
$xml->put_I($i);
$j = 0;
$count_j = $xml->NumChildrenHavingTag("html|body|div[i]|table|tbody|tr");
while ($j < $count_j) {
$xml->put_J($j);
$k = 0;
$count_k = $xml->NumChildrenHavingTag("html|body|div[i]|table|tbody|tr[j]|td");
while ($k < $count_k) {
$xml->put_K($k);
$csv->SetCell($j,$k,$xml->getChildContent("html|body|div[i]|table|tbody|tr[j]|td[k]|text"));
$k = $k + 1;
}
$j = $j + 1;
}
$i = $i + 1;
}
$csv->SaveFile("qa_output/brasil_etf.csv");
$csvStr = $csv->saveToString();
print $csvStr . "\r\n";
# Our CSV looks like this:
# Ticker,Name,Sector,Country,Weight,Notional Vaue
# ITUB4,ITAU UNIBANCO HOLDING PREF SA,Financials,Brazil,10.94,"998,954,813.73"
# BBDC4,BANCO BRADESCO PREF SA,Financials,Brazil,9.01,"822,164,622.75"
# VALE3,CIA VALE DO RIO DOCE SH,Materials,Brazil,8.60,"785,290,260.07"
# PETR4,PETROLEO BRASILEIRO PREF SA,Energy,Brazil,5.68,"518,124,434.10"
# PETR3,PETROBRAS,Energy,Brazil,4.86,"443,254,438.53"
# B3SA3,B3 BRASIL BOLSA BALCAO SA,Financials,Brazil,4.57,"417,636,740.16"
# ABEV3,AMBEV SA,Consumer Staples,Brazil,4.57,"417,216,913.63"
# BBAS3,BANCO DO BRASIL SA,Financials,Brazil,3.25,"296,921,232.15"
# ITSA4,ITAUSA INVESTIMENTOS ITAU PREF SA,Financials,Brazil,2.90,"265,153,684.52"
# LREN3,LOJAS RENNER SA,Consumer Discretionary,Brazil,2.25,"205,832,175.98"
#