Sample code for 30+ languages & platforms
Visual Basic 6.0

A Simple Web Crawler

See more Spider Examples

This demonstrates a very simple web crawler using the Chilkat Spider component.

Chilkat Visual Basic 6.0 Downloads

Visual Basic 6.0
' Objects used by the crawl: the spider itself, the queue of URLs still
' to be crawled, and the list of base domains already visited.
Dim spider As New ChilkatSpider
Dim seedUrls As New CkStringArray
Dim seenDomains As New CkStringArray

' Holds the return value of the most recent Chilkat call.
Dim success As Long
success = 0

' Turn on the Unique option for both string arrays.
seenDomains.Unique = 1
seedUrls.Unique = 1

' Starting point of the crawl -- replace this with a real URL.
success = seedUrls.Append("http://something.whateverYouWant.com/")

With spider
    ' Outbound URL exclude patterns: any URL matching one of these
    ' is not added to the collection of outbound links.
    .AddAvoidOutboundLinkPattern "*?id=*"
    .AddAvoidOutboundLinkPattern "*.mypages.*"
    .AddAvoidOutboundLinkPattern "*.personal.*"
    .AddAvoidOutboundLinkPattern "*.comcast.*"
    .AddAvoidOutboundLinkPattern "*.aol.*"
    .AddAvoidOutboundLinkPattern "*~*"

    ' Cache fetched pages on disk so previously fetched URLs
    ' do not have to be downloaded again.
    .CacheDir = "c:/spiderCache/"
    .FetchFromCache = 1
    .UpdateCache = 1
End With

' Main crawl loop: repeatedly take a seed URL, spider a few pages of its
' domain, then queue outbound links that point to domains not yet seen.
'
' All locals are declared once here. VB6 local declarations have procedure
' scope, so repeating a Dim for the same name inside the loop body is a
' "Duplicate declaration in current scope" compile error.
Dim url As String          ' seed URL currently being crawled
Dim linkUrl As String      ' one outbound link found while crawling
Dim domain As String       ' full domain of the URL being examined
Dim baseDomain As String   ' base domain derived from "domain"
Dim i As Long

Do While seedUrls.Count > 0

    ' Take the next seed URL and point the spider at it.
    url = seedUrls.Pop()
    spider.Initialize url

    ' Save the base domain in seenDomains before crawling, so that
    ' outbound links into this same domain are not queued as seeds below.
    domain = spider.GetUrlDomain(url)
    success = seenDomains.Append(spider.GetBaseDomain(domain))

    ' Spider at most 5 URLs of this domain.
    For i = 0 To 4
        success = spider.CrawlNext()
        If (success <> 1) Then
            ' CrawlNext did not succeed: stop crawling this domain.
            Exit For
        End If

        ' Display the URL we just crawled.
        Debug.Print spider.LastUrl

        ' If the last URL was retrieved from cache, we won't wait.
        ' Otherwise wait 1 second before fetching the next URL.
        If (spider.LastFromCache <> 1) Then
            spider.SleepMs 1000
        End If
    Next

    ' Add the outbound links to seedUrls, except
    ' for the domains we've already seen.
    For i = 0 To spider.NumOutboundLinks - 1

        linkUrl = spider.GetOutboundLink(i)
        domain = spider.GetUrlDomain(linkUrl)
        baseDomain = spider.GetBaseDomain(domain)

        If (seenDomains.Contains(baseDomain) = 0) Then
            ' Don't let our list of seedUrls grow too large.
            If (seedUrls.Count < 1000) Then
                success = seedUrls.Append(linkUrl)
            End If
        End If

    Next

Loop