Chilkat HOME ASP Visual Basic VB.NET C# Visual C++ C MFC Delphi FoxPro Java Perl PHP Python Ruby SQL Server VBScript
|
A Simple Web Crawler
This demonstrates a very simple web crawler using the Chilkat Spider component.
uses
  Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls, Forms,
  Dialogs, StdCtrls, SPIDERXLib_TLB, CHILKATUTILLib_TLB, OleCtrls;

...

{ Button handler: runs a very simple web crawl.

  Starting from one seed URL, repeatedly pops a URL from the seed list,
  crawls up to MaxUrlsPerDomain pages of that site (logging each crawled URL
  to Memo1), then queues the site's outbound links whose base domain has not
  been visited yet. The loop ends when the seed list is empty.

  Sender is the control that fired the event; it is not used. }
procedure TForm1.Button1Click(Sender: TObject);
const
  MaxUrlsPerDomain = 5;    // pages crawled per seed URL
  MaxSeedUrls      = 1000; // stop queuing outbound links beyond this many seeds
  PoliteDelayMs    = 1000; // pause after a fetch that was not served from cache
var
  spider: TSpider;
  seenDomainsCtl: TCkStringArray;
  seedUrlsCtl: TCkStringArray;
  seenDomains: CHILKATUTILLib_TLB.ICkStringArray;
  seedUrls: CHILKATUTILLib_TLB.ICkStringArray;
  url: String;
  domain: String;
  baseDomain: String;
  i: Integer;
  success: Integer;
begin
  spider := nil;
  seenDomainsCtl := nil;
  seedUrlsCtl := nil;
  try
    // The Chilkat Spider component/library is free.
    spider := TSpider.Create(Self);
    seenDomainsCtl := TCkStringArray.Create(Self);
    seedUrlsCtl := TCkStringArray.Create(Self);
    seenDomains := seenDomainsCtl.ControlInterface;
    seedUrls := seedUrlsCtl.ControlInterface;

    seenDomains.Unique := 1;
    seedUrls.Unique := 1;

    seedUrls.Append('http://directory.google.com/Top/Recreation/Outdoors/Hiking/Backpacking/');

    // Set our outbound URL exclude patterns
    spider.AddAvoidOutboundLinkPattern('*?id=*');
    spider.AddAvoidOutboundLinkPattern('*.mypages.*');
    spider.AddAvoidOutboundLinkPattern('*.personal.*');
    spider.AddAvoidOutboundLinkPattern('*.comcast.*');
    spider.AddAvoidOutboundLinkPattern('*.aol.*');
    spider.AddAvoidOutboundLinkPattern('*~*');

    // Use a cache so we don't have to re-fetch URLs previously fetched.
    spider.CacheDir := 'c:/spiderCache/';
    spider.FetchFromCache := 1;
    spider.UpdateCache := 1;

    while seedUrls.Count > 0 do
    begin
      url := seedUrls.Pop();
      spider.Initialize(url);

      // Record the base domain before crawling so that outbound links
      // pointing back to it are not queued again below.
      domain := spider.GetDomain(url);
      seenDomains.Append(spider.GetBaseDomain(domain));

      // Spider a handful of URLs of this domain.
      for i := 0 to MaxUrlsPerDomain - 1 do
      begin
        success := spider.CrawlNext();
        if (success <> 1) then
          Break;

        // Display the URL we just crawled.
        Memo1.Lines.Add(spider.LastUrl);

        // If the last URL was retrieved from cache we don't wait.
        // Otherwise pause before fetching the next URL.
        if (spider.LastFromCache <> 1) then
          spider.SleepMs(PoliteDelayMs);
      end;

      // Add the outbound links to seedUrls, except for the domains
      // we've already seen.
      for i := 0 to spider.NumOutboundLinks - 1 do
      begin
        url := spider.GetOutboundLink(i);
        domain := spider.GetDomain(url);
        baseDomain := spider.GetBaseDomain(domain);

        // Compared against 1 like the other Chilkat results in this
        // procedure. NOTE(review): presumably Contains returns an Integer
        // 1/0 flag; applying "not" to an Integer is a bitwise operation,
        // not a logical test - confirm against the imported type library.
        if (seenDomains.Contains(baseDomain) <> 1) then
          seedUrls.Append(url);

        // Don't let our list of seedUrls grow too large.
        if (seedUrls.Count > MaxSeedUrls) then
          Break;
      end;
    end;
  finally
    // Drop the interface references first, then free the wrapper components
    // so that repeated clicks do not accumulate components on the form.
    seenDomains := nil;
    seedUrls := nil;
    seedUrlsCtl.Free;
    seenDomainsCtl.Free;
    spider.Free;
  end;
end;
Need a specific example? Send a request to support@chilkatsoft.com
© 2000-2007 Chilkat Software, Inc. All Rights Reserved.