Sample code for 30+ languages & platforms
PowerBuilder

A Simple Web Crawler

See more Spider Examples

This demonstrates a very simple web crawler using the Chilkat Spider component.

Chilkat PowerBuilder Downloads

PowerBuilder
// A simple web crawler built on the Chilkat Spider COM component.
//
// Starting from a seed URL, the script crawls up to 5 pages per site,
// records each site's base domain so it is not queued again, and feeds
// outbound links that point at not-yet-seen domains back into the seed
// list.  It runs until the seed list is empty.

integer li_rc
integer li_Success
integer li_NumLinks
integer i
oleobject loo_Spider
oleobject loo_SeenDomains
oleobject loo_SeedUrls
string ls_Url
string ls_Domain
string ls_BaseDomain

li_Success = 0

// Create the COM objects.  ConnectToNewObject returns a negative value on
// failure; every object created so far is destroyed before bailing out.
loo_Spider = create oleobject
li_rc = loo_Spider.ConnectToNewObject("Chilkat.Spider")
if li_rc < 0 then
    destroy loo_Spider
    MessageBox("Error","Connecting to COM object failed")
    return
end if

loo_SeenDomains = create oleobject
li_rc = loo_SeenDomains.ConnectToNewObject("Chilkat.StringArray")
if li_rc < 0 then
    destroy loo_Spider
    destroy loo_SeenDomains
    MessageBox("Error","Connecting to COM object failed")
    return
end if

loo_SeedUrls = create oleobject
li_rc = loo_SeedUrls.ConnectToNewObject("Chilkat.StringArray")
if li_rc < 0 then
    destroy loo_Spider
    destroy loo_SeenDomains
    destroy loo_SeedUrls
    MessageBox("Error","Connecting to COM object failed")
    return
end if

// Ask both string arrays to reject duplicate entries.
loo_SeenDomains.Unique = 1
loo_SeedUrls.Unique = 1

// You will need to change the start URL to something else...
loo_SeedUrls.Append("http://something.whateverYouWant.com/")

// Set outbound URL exclude patterns.
// URLs matching any of these patterns will not be added to the
// collection of outbound links.
loo_Spider.AddAvoidOutboundLinkPattern("*?id=*")
loo_Spider.AddAvoidOutboundLinkPattern("*.mypages.*")
loo_Spider.AddAvoidOutboundLinkPattern("*.personal.*")
loo_Spider.AddAvoidOutboundLinkPattern("*.comcast.*")
loo_Spider.AddAvoidOutboundLinkPattern("*.aol.*")
// "~~" is the PowerScript escape sequence for a single literal tilde.
loo_Spider.AddAvoidOutboundLinkPattern("*~~*")

// Use a cache so we don't have to re-fetch URLs previously fetched.
loo_Spider.CacheDir = "c:/spiderCache/"
loo_Spider.FetchFromCache = 1
loo_Spider.UpdateCache = 1

do while loo_SeedUrls.Count > 0

    // Take the next seed URL and point the spider at it.
    ls_Url = loo_SeedUrls.Pop()
    loo_Spider.Initialize(ls_Url)

    // Remember this site's base domain before crawling so that links
    // back to it are not queued as new seeds later.
    ls_Domain = loo_Spider.GetUrlDomain(ls_Url)
    loo_SeenDomains.Append(loo_Spider.GetBaseDomain(ls_Domain))

    // Spider at most 5 URLs of this domain.
    for i = 0 to 4
        li_Success = loo_Spider.CrawlNext()
        if li_Success <> 1 then
            // Nothing more could be crawled on this site; stop early.
            exit
        end if

        // Display the URL we just crawled.
        // NOTE(review): Write-Debug is not a PowerScript statement; it looks
        // like a placeholder for the application's own output routine --
        // substitute your logging call here.
        Write-Debug loo_Spider.LastUrl

        // If the last URL was retrieved from cache, we won't wait.
        // Otherwise we'll wait 1 second before fetching the next URL.
        if loo_Spider.LastFromCache <> 1 then
            loo_Spider.SleepMs(1000)
        end if
    next

    // Add the outbound links to the seed list, except for links into
    // domains we've already seen.  The link count is read once up front
    // rather than through OLE on every evaluation of the loop limit.
    li_NumLinks = loo_Spider.NumOutboundLinks
    for i = 0 to li_NumLinks - 1

        ls_Url = loo_Spider.GetOutboundLink(i)
        ls_Domain = loo_Spider.GetUrlDomain(ls_Url)
        ls_BaseDomain = loo_Spider.GetBaseDomain(ls_Domain)

        if loo_SeenDomains.Contains(ls_BaseDomain) = 0 then
            // Don't let our list of seed URLs grow too large.
            if loo_SeedUrls.Count < 1000 then
                loo_SeedUrls.Append(ls_Url)
            end if
        end if

    next

loop

// Release the COM objects.
destroy loo_Spider
destroy loo_SeenDomains
destroy loo_SeedUrls