Chilkat HOME ASP Visual Basic VB.NET C# Visual C++ C MFC Delphi FoxPro Java Perl PHP Python Ruby SQL Server VBScript
|
Web Page Analyzer: Analyze the contents of a web page. Gets the HTML title, META description, META keywords, visible text, H1 tags, H2 tags, etc.
// Web Page Analyzer sample.
//
// Downloads a web page, then writes the following to textBox1:
//   - the HTML title, META description and META keywords,
//   - the 10 most common non-stopwords,
//   - the visible text of the page,
//   - the text of every H1 tag.
// Any Chilkat failure is reported through a MessageBox and aborts the sample.

Chilkat.Http http = new Chilkat.Http();

// Any string unlocks the component for the 1st 30-days.
bool success = http.UnlockComponent("Anything for 30-day trial");
if (!success)
{
    MessageBox.Show(http.LastErrorText);
    return;
}

// Page to analyze. An alternative sample page is:
// http://www.marriott.com/hotels/travel/curmc-curacao-marriott-beach-resort-and-emerald-casino/
string url2 = "http://www.techcrunch.com/2007/07/10/dailymotion-comes-to-us-pays-users/";

// Send the HTTP GET and return the content in a string.
// QuickGetStr returns null when the request fails, so stop here rather
// than passing a null document to the analysis calls below.
string html = http.QuickGetStr(url2);
if (html == null)
{
    MessageBox.Show(http.LastErrorText);
    return;
}

Chilkat.HtmlUtil htmlUtil = new Chilkat.HtmlUtil();

// Decode any HTML entities.
// NOTE(review): this decodes the whole document before it is parsed, so an
// escaped "&lt;" presumably becomes a literal "<" in the markup that is later
// handed to HtmlToXml -- confirm this is the intended order.
html = htmlUtil.EntityDecode(html);

// Display the HTML title tag:
textBox1.Text += "---- Title:" + "\r\n";
textBox1.Refresh();
textBox1.Text += htmlUtil.GetTitle(html) + "\r\n";
textBox1.Refresh();

// Display the META Description:
textBox1.Text += "---- Description:" + "\r\n";
textBox1.Refresh();
textBox1.Text += htmlUtil.GetDescription(html) + "\r\n";
textBox1.Refresh();

// Display the META Keywords:
textBox1.Text += "---- Keywords:" + "\r\n";
textBox1.Refresh();
textBox1.Text += htmlUtil.GetKeywords(html) + "\r\n";
textBox1.Refresh();

// Display the 10 most common non-stopwords found in the HTML text.
// (Stopwords are words such as "if", "the", "or", "what", etc.)
bool bExcludeTitle = true;
bool bExcludeDescrip = true;
bool bExcludeKeywords = true;
textBox1.Text += "---- Word Vector:" + "\r\n";
textBox1.Refresh();
textBox1.Text += htmlUtil.GetWordVector(html, 10, bExcludeTitle, bExcludeDescrip, bExcludeKeywords) + "\r\n";
textBox1.Refresh();

// Convert the HTML to XML:
Chilkat.HtmlToXml htmlToXml = new Chilkat.HtmlToXml();
success = htmlToXml.UnlockComponent("Anything for 30-day trial");
if (!success)
{
    MessageBox.Show(htmlToXml.LastErrorText);
    return;
}

// Calling DropTextFormattingTags causes
// all text formatting tags (font, b, i, etc.) to be dropped
// in the conversion process.
htmlToXml.DropTextFormattingTags();

htmlToXml.Html = html;
string xHtml = htmlToXml.ToXml();

// Load the converted document. If it cannot be parsed, report the error
// instead of silently showing empty results for the sections below.
Chilkat.Xml xml = new Chilkat.Xml();
success = xml.LoadXml(xHtml);
if (!success)
{
    MessageBox.Show(xml.LastErrorText);
    return;
}

// The AccumulateTagContent method accumulates the content
// of all nodes having a specific tag. The XML created by
// HtmlToXml places all text under "text" nodes.
// The 2nd argument to AccumulateTagContent indicates the
// tags for nodes to be skipped. If more than one
// tag is to be skipped, separate them with vertical bar
// characters, such as "script|h1|h2".
string visibleText = xml.AccumulateTagContent("text", "script|style");
textBox1.Text += "---- Visible Text:" + "\r\n";
textBox1.Refresh();
textBox1.Text += visibleText + "\r\n";
textBox1.Refresh();

// Iterate over all h1 tags:
// (Note: HtmlToXml will produce XML with lowercase tags.)
// Passing null as the first argument starts the search at the beginning;
// passing the previous match resumes the search after that node.
Chilkat.Xml xBeginSearchAfter = null;
Chilkat.Xml xNode = xml.SearchForTag(xBeginSearchAfter, "h1");
while (xNode != null)
{
    // Any given node may have child nodes, so accumulate the content:
    string h1Text = xNode.AccumulateTagContent("text", "script|style");
    textBox1.Text += "---- H1:" + "\r\n";
    textBox1.Refresh();
    textBox1.Text += h1Text + "\r\n";
    textBox1.Refresh();

    xBeginSearchAfter = xNode;
    xNode = xml.SearchForTag(xBeginSearchAfter, "h1");
}
Need a specific example? Send a request to support@chilkatsoft.com
© 2000-2008 Chilkat Software, Inc. All Rights Reserved.