Chilkat HOME ASP Visual Basic VB.NET C# Visual C++ C MFC Delphi FoxPro Java Perl PHP Python Ruby SQL Server VBScript
|
Web Page Analyzer: Analyze the contents of a web page. Gets the HTML title, META description, META keywords, visible text, H1 tags, H2 tags, etc.
' Web page analyzer example (body of a Sub).
'
' Downloads a page over HTTP and appends to TextBox1: the HTML title, the
' META description, the META keywords, the 10 most common non-stopwords,
' the visible text, and the text of every H1 tag.
' Any failure (component unlock, download, HTML-to-XML conversion, XML load)
' is reported via MsgBox and the Sub exits early.

Dim http As New Chilkat.Http()
Dim success As Boolean

' Any string unlocks the component for the 1st 30-days.
success = http.UnlockComponent("Anything for 30-day trial")
If Not success Then
    MsgBox(http.LastErrorText)
    Exit Sub
End If

' Page to analyze. Another sample page that can be substituted:
'   http://www.marriott.com/hotels/travel/curmc-curacao-marriott-beach-resort-and-emerald-casino/
Dim url As String
url = "http://www.techcrunch.com/2007/07/10/dailymotion-comes-to-us-pays-users/"

' Send the HTTP GET and return the content in a string.
Dim html As String
html = http.QuickGetStr(url)

' Stop here if the download failed or returned nothing; without this check
' every later step would run against a missing document.
If String.IsNullOrEmpty(html) Then
    MsgBox(http.LastErrorText)
    Exit Sub
End If

Dim htmlUtil As New Chilkat.HtmlUtil()

' Decode any HTML entities.
' NOTE(review): this decodes the whole document before it is parsed, so
' escaped markup such as &lt;b&gt; may be turned into real tags -- confirm
' this is the intended order.
html = htmlUtil.EntityDecode(html)

' Display the HTML title tag:
TextBox1.Text &= "---- Title:" & vbCrLf
TextBox1.Refresh()
TextBox1.Text &= htmlUtil.GetTitle(html) & vbCrLf
TextBox1.Refresh()

' Display the META Description:
TextBox1.Text &= "---- Description:" & vbCrLf
TextBox1.Refresh()
TextBox1.Text &= htmlUtil.GetDescription(html) & vbCrLf
TextBox1.Refresh()

' Display the META Keywords:
TextBox1.Text &= "---- Keywords:" & vbCrLf
TextBox1.Refresh()
TextBox1.Text &= htmlUtil.GetKeywords(html) & vbCrLf
TextBox1.Refresh()

' Display the 10 most common non-stopwords found in the HTML text.
' (Stopwords are words such as "if", "the", "or", "what", etc.)
Dim bExcludeTitle As Boolean = True
Dim bExcludeDescrip As Boolean = True
Dim bExcludeKeywords As Boolean = True

TextBox1.Text &= "---- Word Vector:" & vbCrLf
TextBox1.Refresh()
TextBox1.Text &= htmlUtil.GetWordVector(html, 10, bExcludeTitle, bExcludeDescrip, bExcludeKeywords) & vbCrLf
TextBox1.Refresh()

' Convert the HTML to XML:
Dim htmlToXml As New Chilkat.HtmlToXml()
success = htmlToXml.UnlockComponent("Anything for 30-day trial")
If Not success Then
    MsgBox(htmlToXml.LastErrorText)
    Exit Sub
End If

' Calling DropTextFormattingTags causes
' all text formatting tags (font, b, i, etc.) to be dropped
' in the conversion process.
htmlToXml.DropTextFormattingTags()

Dim xHtml As String
htmlToXml.Html = html
xHtml = htmlToXml.ToXml()

' The conversion result was previously used unchecked.
If String.IsNullOrEmpty(xHtml) Then
    MsgBox(htmlToXml.LastErrorText)
    Exit Sub
End If

Dim xml As New Chilkat.Xml()
success = xml.LoadXml(xHtml)

' The load result was previously ignored; searching an unloaded document
' would silently produce no output.
If Not success Then
    MsgBox(xml.LastErrorText)
    Exit Sub
End If

' The AccumulateTagContent method accumulates the content
' of all nodes having a specific tag. The XML created by
' HtmlToXml places all text under "text" nodes.
' The 2nd argument to AccumulateTagContent indicates the
' tags for nodes to be skipped. If more than one
' tag is to be skipped, separate them with vertical bar
' characters, such as "script|h1|h2".
Dim visibleText As String
visibleText = xml.AccumulateTagContent("text", "script|style")

TextBox1.Text &= "---- Visible Text:" & vbCrLf
TextBox1.Refresh()
TextBox1.Text &= visibleText & vbCrLf
TextBox1.Refresh()

' Iterate over all h1 tags:
' (Note: HtmlToXml will produce XML with lowercase tags.)
Dim xNode As Chilkat.Xml
Dim xBeginSearchAfter As Chilkat.Xml = Nothing
Dim h1Text As String

' Find the 1st H1 tag. Passing Nothing starts the search at the beginning.
xNode = xml.SearchForTag(xBeginSearchAfter, "h1")
While xNode IsNot Nothing
    ' Any given node may have child nodes, so accumulate the content:
    h1Text = xNode.AccumulateTagContent("text", "script|style")

    TextBox1.Text &= "---- H1:" & vbCrLf
    TextBox1.Refresh()
    TextBox1.Text &= h1Text & vbCrLf
    TextBox1.Refresh()

    ' Resume the search after the node just processed.
    xBeginSearchAfter = xNode
    xNode = xml.SearchForTag(xBeginSearchAfter, "h1")
End While
Need a specific example? Send a request to support@chilkatsoft.com
© 2000-2007 Chilkat Software, Inc. All Rights Reserved.