PowerBuilder
PowerBuilder
A Simple Web Crawler
See more Spider Examples
This demonstrates a very simple web crawler using the Chilkat Spider component.
Chilkat PowerBuilder Downloads
// A very simple web crawler built on the Chilkat Spider COM component.
//
// Starting from one seed URL, the script crawls up to 5 pages per site,
// then queues that site's outbound links (skipping base domains that have
// already been crawled) as new seeds, and repeats until no seeds remain.
integer li_rc
integer li_Success
integer li_NumLinks
integer i
oleobject loo_Spider
oleobject loo_SeenDomains
oleobject loo_SeedUrls
string ls_Url
string ls_Domain
string ls_BaseDomain

li_Success = 0

loo_Spider = create oleobject
li_rc = loo_Spider.ConnectToNewObject("Chilkat.Spider")
if li_rc < 0 then
    destroy loo_Spider
    MessageBox("Error","Connecting to COM object failed")
    return
end if

// Base domains that have already been crawled.
loo_SeenDomains = create oleobject
li_rc = loo_SeenDomains.ConnectToNewObject("Chilkat.StringArray")
if li_rc < 0 then
    destroy loo_Spider
    destroy loo_SeenDomains
    MessageBox("Error","Connecting to COM object failed")
    return
end if

// URLs still waiting to be used as a crawl starting point.
loo_SeedUrls = create oleobject
li_rc = loo_SeedUrls.ConnectToNewObject("Chilkat.StringArray")
if li_rc < 0 then
    destroy loo_Spider
    destroy loo_SeenDomains
    destroy loo_SeedUrls
    MessageBox("Error","Connecting to COM object failed")
    return
end if

loo_SeenDomains.Unique = 1
loo_SeedUrls.Unique = 1

// You will need to change the start URL to something else...
loo_SeedUrls.Append("http://something.whateverYouWant.com/")

// Set outbound URL exclude patterns.
// URLs matching any of these patterns will not be added to the
// collection of outbound links.
loo_Spider.AddAvoidOutboundLinkPattern("*?id=*")
loo_Spider.AddAvoidOutboundLinkPattern("*.mypages.*")
loo_Spider.AddAvoidOutboundLinkPattern("*.personal.*")
loo_Spider.AddAvoidOutboundLinkPattern("*.comcast.*")
loo_Spider.AddAvoidOutboundLinkPattern("*.aol.*")
loo_Spider.AddAvoidOutboundLinkPattern("*~~*")

// Use a cache so we don't have to re-fetch URLs previously fetched.
loo_Spider.CacheDir = "c:/spiderCache/"
loo_Spider.FetchFromCache = 1
loo_Spider.UpdateCache = 1

do while loo_SeedUrls.Count > 0

    ls_Url = loo_SeedUrls.Pop()
    loo_Spider.Initialize(ls_Url)

    // Record the base domain in seenDomains before crawling it, so that
    // links back to this site are not queued again below.
    ls_Domain = loo_Spider.GetUrlDomain(ls_Url)
    loo_SeenDomains.Append(loo_Spider.GetBaseDomain(ls_Domain))

    // Spider at most 5 URLs of this domain.
    for i = 0 to 4
        li_Success = loo_Spider.CrawlNext()
        if li_Success <> 1 then
            // CrawlNext did not report success; stop crawling this site.
            exit
        end if

        // Display the URL we just crawled.
        // NOTE(review): Write-Debug is carried over unchanged from the original
        // example and is not a PowerScript statement -- replace it with the
        // application's own logging/output call before compiling.
        Write-Debug loo_Spider.LastUrl

        // If the last URL was retrieved from cache, we won't wait.
        // Otherwise we'll wait 1 second before fetching the next URL.
        if loo_Spider.LastFromCache <> 1 then
            loo_Spider.SleepMs(1000)
        end if
    next

    // Add the outbound links to seedUrls, except
    // for the domains we've already seen.
    // Read the link count once instead of on every loop-limit evaluation.
    li_NumLinks = loo_Spider.NumOutboundLinks
    for i = 0 to li_NumLinks - 1
        ls_Url = loo_Spider.GetOutboundLink(i)
        ls_Domain = loo_Spider.GetUrlDomain(ls_Url)
        ls_BaseDomain = loo_Spider.GetBaseDomain(ls_Domain)
        if loo_SeenDomains.Contains(ls_BaseDomain) = 0 then
            // Don't let our list of seedUrls grow too large.
            if loo_SeedUrls.Count < 1000 then
                loo_SeedUrls.Append(ls_Url)
            end if
        end if
    next

loop

destroy loo_Spider
destroy loo_SeenDomains
destroy loo_SeedUrls