Xojo Plugin
Xojo Plugin
A Simple Web Crawler
See more Spider Examples
This demonstrates a very simple web crawler using the Chilkat Spider component.
Chilkat Xojo Plugin Downloads
// A very simple web crawler built on the Chilkat Spider component.
//
// Starting from a seed URL, it crawls up to 5 pages of each domain, logs every
// crawled URL with System.DebugLog, and then queues the outbound links whose
// base domain has not been recorded yet. The loop ends when the queue of seed
// URLs is empty.

Dim success As Boolean
success = False

Dim spider As New Chilkat.Spider
Dim seenDomains As New Chilkat.StringArray
Dim seedUrls As New Chilkat.StringArray

// Both collections are set to hold unique strings only.
seenDomains.Unique = True
seedUrls.Unique = True

// You will need to change the start URL to something else...
success = seedUrls.Append("http://something.whateverYouWant.com/")

// Set outbound URL exclude patterns.
// URLs matching any of these patterns will not be added to the
// collection of outbound links.
spider.AddAvoidOutboundLinkPattern "*?id=*"
spider.AddAvoidOutboundLinkPattern "*.mypages.*"
spider.AddAvoidOutboundLinkPattern "*.personal.*"
spider.AddAvoidOutboundLinkPattern "*.comcast.*"
spider.AddAvoidOutboundLinkPattern "*.aol.*"
spider.AddAvoidOutboundLinkPattern "*~*"

// Use a cache so we don't have to re-fetch URLs previously fetched.
spider.CacheDir = "c:/spiderCache/"
spider.FetchFromCache = True
spider.UpdateCache = True

While seedUrls.Count > 0
  Dim url As String
  url = seedUrls.Pop()
  spider.Initialize url

  // Before crawling, record the base domain of this seed URL in seenDomains.
  // "domain" is declared once here and reused by the outbound-link loop below,
  // so the nested loop does not redeclare a name that is still in scope.
  Dim domain As String
  domain = spider.GetUrlDomain(url)
  success = seenDomains.Append(spider.GetBaseDomain(domain))

  // Spider up to 5 URLs of this domain.
  Dim i As Int32
  For i = 0 To 4
    success = spider.CrawlNext()
    If Not success Then
      // CrawlNext reported failure: stop crawling this domain.
      Exit For
    End If

    // Display the URL we just crawled.
    System.DebugLog(spider.LastUrl)

    // If the last URL was retrieved from cache, we won't wait.
    // Otherwise we'll wait 1 second before fetching the next URL.
    If Not spider.LastFromCache Then
      spider.SleepMs 1000
    End If
  Next

  // Add the outbound links to seedUrls, except
  // for the domains we've already seen.
  For i = 0 To spider.NumOutboundLinks - 1
    url = spider.GetOutboundLink(i)
    domain = spider.GetUrlDomain(url)

    Dim baseDomain As String
    baseDomain = spider.GetBaseDomain(domain)

    If Not seenDomains.Contains(baseDomain) Then
      // Don't let our list of seedUrls grow too large.
      If seedUrls.Count < 1000 Then
        success = seedUrls.Append(url)
      End If
    End If
  Next
Wend