Chilkat HOME ASP Visual Basic VB.NET C# Visual C++ C MFC Delphi FoxPro Java Perl PHP Python Ruby SQL Server VBScript
|
A Simple Web Crawler
This demonstrates a very simple web crawler using the Chilkat Spider component.
* Simple web crawler built on the Chilkat Spider ActiveX component.
* Starting from one seed URL, it crawls up to 5 URLs per seed, printing each
* crawled URL, and then queues the outbound links whose base domain has not
* been recorded yet. It stops when the queue of seed URLs is empty.

LOCAL loSpider
LOCAL loSeenDomains
LOCAL loSeedUrls
LOCAL lcUrl
LOCAL lcDomain
LOCAL lcBaseDomain
LOCAL lnSuccess
LOCAL i

* The Chilkat Spider component/library is free.
loSpider = CreateObject('Chilkat.Spider')

* Both string arrays are configured with Unique = 1.
loSeenDomains = CreateObject('Chilkat.CkStringArray')
loSeedUrls = CreateObject('Chilkat.CkStringArray')
loSeenDomains.Unique = 1
loSeedUrls.Unique = 1

loSeedUrls.Append("http://directory.google.com/Top/Recreation/Outdoors/Hiking/Backpacking/")

* Use a cache so we don't have to re-fetch URLs previously fetched.
loSpider.CacheDir = "c:/spiderCache/"
loSpider.FetchFromCache = 1
loSpider.UpdateCache = 1

DO WHILE loSeedUrls.Count > 0

    lcUrl = loSeedUrls.Pop()
    loSpider.Initialize(lcUrl)

    * Set our outbound URL exclude patterns.
    * NOTE: the Chilkat Spider reference states that Initialize clears
    * patterns added via AddAvoidOutboundLinkPattern, so they are applied
    * after every Initialize instead of once before the loop.
    loSpider.AddAvoidOutboundLinkPattern("*?id=*")
    loSpider.AddAvoidOutboundLinkPattern("*.mypages.*")
    loSpider.AddAvoidOutboundLinkPattern("*.personal.*")
    loSpider.AddAvoidOutboundLinkPattern("*.comcast.*")
    loSpider.AddAvoidOutboundLinkPattern("*.aol.*")
    loSpider.AddAvoidOutboundLinkPattern("*~*")

    * Spider 5 URLs of this domain,
    * but first, save the base domain in seenDomains.
    lcDomain = loSpider.GetDomain(lcUrl)
    loSeenDomains.Append(loSpider.GetBaseDomain(lcDomain))

    FOR i = 0 TO 4
        lnSuccess = loSpider.CrawlNext()
        IF (lnSuccess <> 1) THEN
            * Nothing more could be crawled for this seed.
            EXIT
        ENDIF

        * Display the URL we just crawled.
        ? loSpider.LastUrl

        * If the last URL was retrieved from cache,
        * we won't wait. Otherwise we'll wait 1 second
        * before fetching the next URL.
        IF (loSpider.LastFromCache <> 1) THEN
            loSpider.SleepMs(1000)
        ENDIF
    NEXT

    * Add the outbound links to seedUrls, except
    * for the domains we've already seen.
    FOR i = 0 TO loSpider.NumOutboundLinks - 1

        lcUrl = loSpider.GetOutboundLink(i)
        lcDomain = loSpider.GetDomain(lcUrl)
        lcBaseDomain = loSpider.GetBaseDomain(lcDomain)

        * Compared against 1 like the other Chilkat results in this script,
        * because NOT needs a logical operand.
        * Assumes Contains returns an integer 1/0 - TODO confirm.
        IF (loSeenDomains.Contains(lcBaseDomain) <> 1) THEN
            loSeedUrls.Append(lcUrl)
        ENDIF

        * Don't let our list of seedUrls grow too large.
        IF (loSeedUrls.Count > 1000) THEN
            EXIT
        ENDIF

    NEXT

ENDDO
Need a specific example? Send a request to support@chilkatsoft.com
© 2000-2007 Chilkat Software, Inc. All Rights Reserved.
Mail Component · .NET Email Component · ASP Mail Component · XML Parser