Sample code for 30+ languages & platforms
Xojo Plugin

A Simple Web Crawler

See more Spider Examples

This demonstrates a very simple web crawler using the Chilkat Spider component.

Chilkat Xojo Plugin Downloads

Xojo Plugin
// A very simple breadth-style web crawler built on Chilkat.Spider.
//
// Starting from one or more seed URLs, it crawls up to MAX_PAGES_PER_DOMAIN
// pages within each seed's domain, logs every crawled URL, and then queues
// the outbound links that point at base domains not yet visited.
Dim success As Boolean
success = False

// Upper bound on pages crawled per seed, and on the size of the seed queue.
Const MAX_PAGES_PER_DOMAIN = 5
Const MAX_SEED_URLS = 1000
// Politeness delay (milliseconds) after a page that was fetched from the network.
Const FETCH_DELAY_MS = 1000

Dim spider As New Chilkat.Spider

Dim seenDomains As New Chilkat.StringArray
Dim seedUrls As New Chilkat.StringArray

// Unique = True makes both arrays reject duplicate entries.
seenDomains.Unique = True
seedUrls.Unique = True

// You will need to change the start URL to something else...
success = seedUrls.Append("http://something.whateverYouWant.com/")

// Outbound URL exclude patterns.
// URLs matching any of these patterns will not be added to the
// collection of outbound links.
Dim avoidPatterns() As String = Array("*?id=*", "*.mypages.*", "*.personal.*", "*.comcast.*", "*.aol.*", "*~*")

// Use a cache so we don't have to re-fetch URLs previously fetched.
spider.CacheDir = "c:/spiderCache/"
spider.FetchFromCache = True
spider.UpdateCache = True

While seedUrls.Count > 0

    Dim url As String
    url = seedUrls.Pop()
    spider.Initialize url

    // NOTE(review): per the Chilkat Spider reference, Initialize clears any
    // patterns previously added with AddAvoidOutboundLinkPattern, so they are
    // (re-)registered here for every seed rather than once before the loop.
    For Each pattern As String In avoidPatterns
        spider.AddAvoidOutboundLinkPattern pattern
    Next

    // Remember the base domain of this seed so that links back into it
    // are not queued again later.
    Dim domain As String
    domain = spider.GetUrlDomain(url)
    success = seenDomains.Append(spider.GetBaseDomain(domain))

    Dim i As Int32

    // Spider up to MAX_PAGES_PER_DOMAIN URLs of this domain.
    For i = 0 To MAX_PAGES_PER_DOMAIN - 1
        success = spider.CrawlNext()
        If Not success Then
            // CrawlNext did not succeed; stop crawling this domain.
            Exit For
        End If

        // Display the URL we just crawled.
        System.DebugLog(spider.LastUrl)

        // If the last URL was retrieved from cache, we won't wait.
        // Otherwise pause before fetching the next URL.
        If Not spider.LastFromCache Then
            spider.SleepMs FETCH_DELAY_MS
        End If
    Next

    // Add the outbound links to seedUrls, except
    // for the domains we've already seen.
    For i = 0 To spider.NumOutboundLinks - 1

        Dim outboundUrl As String
        outboundUrl = spider.GetOutboundLink(i)
        domain = spider.GetUrlDomain(outboundUrl)
        Dim baseDomain As String
        baseDomain = spider.GetBaseDomain(domain)

        // Don't let our list of seedUrls grow too large.
        If (Not seenDomains.Contains(baseDomain)) And (seedUrls.Count < MAX_SEED_URLS) Then
            success = seedUrls.Append(outboundUrl)
        End If

    Next

Wend