Unicode C
Unicode C
A Simple Web Crawler
See more Spider Examples
This demonstrates a very simple web crawler using the Chilkat Spider component.
Chilkat Unicode C Downloads
#include <wchar.h>

#include <C_CkSpiderW.h>
#include <C_CkStringArrayW.h>
/**
 * Runs a very simple web crawl with the Chilkat Spider component.
 *
 * A work list of seed URLs is processed until it is empty. For each seed:
 *   1. the spider is initialized on that URL and the seed's base domain is
 *      recorded in the list of seen domains;
 *   2. up to URLS_PER_DOMAIN pages are crawled, printing each crawled URL
 *      to standard output;
 *   3. every outbound link gathered by the spider whose base domain has not
 *      been seen yet is appended to the work list, as long as the work list
 *      holds fewer than MAX_SEED_URLS entries.
 *
 * Takes no arguments and returns nothing. All Chilkat objects created here
 * are disposed before returning.
 */
void ChilkatSample(void)
{
    /* Crawl limits, named so they are not buried inside the loops. */
    enum {
        URLS_PER_DOMAIN = 5,    /* pages crawled per seed URL */
        MAX_SEED_URLS = 1000,   /* cap on the pending seed list */
        FETCH_DELAY_MS = 1000   /* pause after a non-cached fetch */
    };

    HCkSpiderW spider;
    HCkStringArrayW seenDomains;
    HCkStringArrayW seedUrls;
    int i;

    spider = CkSpiderW_Create();
    seenDomains = CkStringArrayW_Create();
    seedUrls = CkStringArrayW_Create();

    // Turn on the Unique property of both arrays.
    // NOTE(review): per the Chilkat docs this suppresses duplicate entries - confirm.
    CkStringArrayW_putUnique(seenDomains, TRUE);
    CkStringArrayW_putUnique(seedUrls, TRUE);

    // You will need to change the start URL to something else...
    CkStringArrayW_Append(seedUrls, L"http://something.whateverYouWant.com/");

    // Set outbound URL exclude patterns.
    // URLs matching any of these patterns will not be added to the
    // collection of outbound links.
    CkSpiderW_AddAvoidOutboundLinkPattern(spider, L"*?id=*");
    CkSpiderW_AddAvoidOutboundLinkPattern(spider, L"*.mypages.*");
    CkSpiderW_AddAvoidOutboundLinkPattern(spider, L"*.personal.*");
    CkSpiderW_AddAvoidOutboundLinkPattern(spider, L"*.comcast.*");
    CkSpiderW_AddAvoidOutboundLinkPattern(spider, L"*.aol.*");
    CkSpiderW_AddAvoidOutboundLinkPattern(spider, L"*~*");

    // Use a cache so we don't have to re-fetch URLs previously fetched.
    CkSpiderW_putCacheDir(spider, L"c:/spiderCache/");
    CkSpiderW_putFetchFromCache(spider, TRUE);
    CkSpiderW_putUpdateCache(spider, TRUE);

    while (CkStringArrayW_getCount(seedUrls) > 0) {
        // NOTE(review): these pointers refer to strings returned by the
        // Chilkat objects; how long they stay valid is not visible here -
        // confirm against the Chilkat documentation before holding them
        // across additional calls.
        const wchar_t *url;
        const wchar_t *domain;
        const wchar_t *baseDomain;

        url = CkStringArrayW_pop(seedUrls);
        CkSpiderW_Initialize(spider, url);

        // Before crawling, remember this seed's base domain so links back
        // into it are not queued again later.
        domain = CkSpiderW_getUrlDomain(spider, url);
        CkStringArrayW_Append(seenDomains, CkSpiderW_getBaseDomain(spider, domain));

        // Spider up to URLS_PER_DOMAIN URLs of this domain.
        for (i = 0; i < URLS_PER_DOMAIN; i++) {
            if (!CkSpiderW_CrawlNext(spider)) {
                // CrawlNext reported failure: stop crawling this seed.
                break;
            }

            // Display the URL we just crawled. %ls is the portable
            // conversion for a wchar_t string in the wprintf family.
            wprintf(L"%ls\n", CkSpiderW_lastUrl(spider));

            // If the last URL was retrieved from cache, we won't wait.
            // Otherwise pause before fetching the next URL.
            if (!CkSpiderW_getLastFromCache(spider)) {
                CkSpiderW_SleepMs(spider, FETCH_DELAY_MS);
            }
        }

        // Add the outbound links to seedUrls, except for the domains
        // we've already seen.
        for (i = 0; i < CkSpiderW_getNumOutboundLinks(spider); i++) {
            url = CkSpiderW_getOutboundLink(spider, i);
            domain = CkSpiderW_getUrlDomain(spider, url);
            baseDomain = CkSpiderW_getBaseDomain(spider, domain);

            // Don't let our list of seedUrls grow too large.
            if (!CkStringArrayW_Contains(seenDomains, baseDomain)
                && CkStringArrayW_getCount(seedUrls) < MAX_SEED_URLS) {
                CkStringArrayW_Append(seedUrls, url);
            }
        }
    }

    CkSpiderW_Dispose(spider);
    CkStringArrayW_Dispose(seenDomains);
    CkStringArrayW_Dispose(seedUrls);
}