diff --git a/CHANGELOG.md b/CHANGELOG.md
index f0b8b205f8005b994aecc6556a476dbca0d49b58..11b7a2ddc07faa6c0a966a653f47fcaae319839a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 
 ### Added
 
 - Emit `newpage` event.
+- Support `deniedDomains` for [crawler.queue()](https://github.com/yujiosaka/headless-chrome-crawler#crawlerqueueoptions)'s options.
+
+### Changed
+
+- Allow `allowedDomains` option to accept a list of regular expressions.
 
 ## [1.3.2] - 2018-01-19
diff --git a/README.md b/README.md
index a9e8d9b69d9bf4e1f1391d604f1f7d48fd328c6f..590a800b84b77b772cbd20fba297ac6fb3a8b7cf 100644
--- a/README.md
+++ b/README.md
@@ -180,7 +180,7 @@ browserWSEndpoint, ignoreHTTPSErrors
 Also, the following options can be set as default values when [crawler.queue()](#crawlerqueueoptions) is executed.
 
 ```
-url, allowedDomains, timeout, priority, delay, retryCount, retryDelay, jQuery, device, username, password, evaluatePage
+url, allowedDomains, deniedDomains, timeout, priority, delay, retryCount, retryDelay, jQuery, device, username, password, evaluatePage
 ```
 
 > **Note**: In practice, setting the options every time you queue requests is redundant. Therefore, it's recommended to set the default values and override them as necessary.
@@ -220,7 +220,7 @@ ignoreHTTPSErrors, headless, executablePath, slowMo, args, ignoreDefaultArgs, ha
 Also, the following options can be set as default values when [crawler.queue()](#crawlerqueueoptions) is executed.
 
 ```
-url, allowedDomains, timeout, priority, delay, retryCount, retryDelay, jQuery, device, username, password, evaluatePage
+url, allowedDomains, deniedDomains, timeout, priority, delay, retryCount, retryDelay, jQuery, device, username, password, evaluatePage
 ```
 
 > **Note**: In practice, setting the options every time you queue requests is redundant. Therefore, it's recommended to set the default values and override them as necessary.
@@ -242,7 +242,8 @@ url, allowedDomains, timeout, priority, delay, retryCount, retryDelay, jQuery, d
 * `skipDuplicates` <[boolean]> Whether to skip duplicate requests, default to `null`. The request is considered to be the same if `url`, `userAgent`, `device` and `extraHeaders` are strictly the same.
 * `obeyRobotsTxt` <[boolean]> Whether to obey [robots.txt](https://developers.google.com/search/reference/robots_txt), default to `true`.
 * `followSitemapXml` <[boolean]> Whether to use [sitemap.xml](https://www.sitemaps.org/) to find locations, default to `false`.
-* `allowedDomains` <[Array]<[string]>> List of domains allowed to request. `www.example.com` will be allowed if `example.com` is listed.
+* `allowedDomains` <[Array]<[string]|[RegExp]>> List of domains allowed to request. Pass `null` or leave default to skip checking allowed domains.
+* `deniedDomains` <[Array]<[string]|[RegExp]>> List of domains not allowed to request. Pass `null` or leave default to skip checking denied domains.
 * `delay` <[number]> Number of milliseconds to wait after each request, defaults to `0`. When `delay` is set, the `maxConcurrency` option must be `1`.
 * `retryCount` <[number]> Number of retries when a request fails, defaults to `3`.
 * `retryDelay` <[number]> Number of milliseconds to wait after each retry fails, defaults to `10000`.
@@ -548,6 +549,7 @@ Dynamic crawlers based on [PhantomJS](http://phantomjs.org) and [Selenium](http:
 [Object]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Object "Object"
 [Promise]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Promise "Promise"
 [string]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Data_structures#String_type "String"
+[RegExp]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp "RegExp"
 [Serializable]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/JSON/stringify#Description "Serializable"
 [Error]: https://nodejs.org/api/errors.html#errors_class_error "Error"
 [HCCrawler]: #class-hccrawler "HCCrawler"
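Taken together, the README changes above mean `crawler.queue()` now accepts both lists, with string entries compared against the hostname exactly and RegExp entries tested against it. A minimal usage sketch in the style of the README's launch example; the URLs here are illustrative, not part of this change:

```js
const HCCrawler = require('headless-chrome-crawler');

HCCrawler.launch({
  onSuccess: (result => { console.log(result); }),
})
  .then(crawler => {
    // String entries must equal the hostname exactly.
    crawler.queue({ url: 'https://example.com/', allowedDomains: ['example.com'] });
    // RegExp entries are tested against the hostname, so subdomains can match.
    crawler.queue({ url: 'https://news.example.com/', allowedDomains: [/\.example\.com$/] });
    // Hostnames matching deniedDomains are skipped.
    crawler.queue({ url: 'https://example.net/', deniedDomains: ['example.net'] });
    return crawler.onIdle().then(() => crawler.close());
  });
```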
diff --git a/lib/hccrawler.js b/lib/hccrawler.js
index d47a2421d6494b11a082ffbce521c1840bedc601..857e6dd3b3553bf7f209f3c990e176b8254738db 100644
--- a/lib/hccrawler.js
+++ b/lib/hccrawler.js
@@ -7,8 +7,6 @@ const {
   map,
   each,
   includes,
-  some,
-  endsWith,
   isString,
   isArray,
 } = require('lodash');
@@ -19,6 +17,7 @@ const devices = require('puppeteer/DeviceDescriptors');
 const {
   delay,
   generateKey,
+  checkDomainMatch,
   getRobotsUrl,
   getSitemapUrls,
   tracePublicAPI,
@@ -452,8 +451,9 @@ class HCCrawler extends EventEmitter {
    */
   _checkAllowedDomains(options) {
     const { hostname } = parse(options.url);
-    if (!options.allowedDomains) return true;
-    return some(options.allowedDomains, domain => endsWith(hostname, domain));
+    if (options.deniedDomains && checkDomainMatch(options.deniedDomains, hostname)) return false;
+    if (options.allowedDomains && !checkDomainMatch(options.allowedDomains, hostname)) return false;
+    return true;
   }
 
   /**
diff --git a/lib/helper.js b/lib/helper.js
index f5095786a2ec037003e752a0ed5060ea8f37aecb..79d40b57f02b8d27c601b541db481383005f77c3 100644
--- a/lib/helper.js
+++ b/lib/helper.js
@@ -3,12 +3,14 @@ const { parse, resolve, format } = require('url');
 const crypto = require('crypto');
 const {
   pick,
-  isPlainObject,
   trim,
   startsWith,
+  some,
   includes,
+  isPlainObject,
   isString,
   isFunction,
+  isRegExp,
 } = require('lodash');
 
 const debug = require('debug');
@@ -121,6 +123,18 @@
     return first;
   }
 
+  /**
+   * @param {!Array<!string|RegExp>} domains
+   * @param {!string} hostname
+   * @return {!boolean}
+   */
+  static checkDomainMatch(domains, hostname) {
+    return some(domains, domain => {
+      if (isRegExp(domain)) return domain.test(hostname);
+      return domain === hostname;
+    });
+  }
+
   /**
    * @param {!string} sitemapXml
    * @return {!Array<!string>}
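Note the behavioral shift hiding in `checkDomainMatch`: the old `endsWith` comparison in `_checkAllowedDomains` let `example.com` match `www.example.com`, whereas string entries now require an exact hostname match and only RegExp entries can cover subdomains (which is why the README dropped its `www.example.com` sentence). A rough illustration of the helper, assuming it is required from `lib/helper.js` as the import hunk above suggests; the hostnames are placeholders:

```js
const { checkDomainMatch } = require('./lib/helper');

checkDomainMatch(['example.com'], 'example.com');             // true: exact string match
checkDomainMatch(['example.com'], 'www.example.com');         // false: endsWith semantics are gone
checkDomainMatch([/(^|\.)example\.com$/], 'www.example.com'); // true: RegExp entries can match subdomains
```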
diff --git a/test/hccrawler.test.js b/test/hccrawler.test.js
index 428950ff1c894b2a8afbf9ed3d3bf096a82a21f1..ad6828a4b58fde7342eb16d2d4c5371d1ead4bf5 100644
--- a/test/hccrawler.test.js
+++ b/test/hccrawler.test.js
@@ -191,7 +191,7 @@ describe('HCCrawler', () => {
         });
     });
 
-    it('crawls when the requested domain is allowed', () => {
+    it('crawls when the requested domain exactly matches allowed domain', () => {
      let requestskipped = 0;
      crawler.on('requestskipped', () => { requestskipped += 1; });
      crawler.queue({ url: INDEX_PAGE, allowedDomains: ['127.0.0.1'] });
@@ -202,7 +202,18 @@ describe('HCCrawler', () => {
      return crawler.onIdle()
        .then(() => {
          assert.equal(requestskipped, 0);
          assert.equal(onSuccess.callCount, 1);
        });
    });
 
-    it('skips crawling when the requested domain is not allowed', () => {
+    it('crawls when the requested domain matches allowed domain by regular expression', () => {
+      let requestskipped = 0;
+      crawler.on('requestskipped', () => { requestskipped += 1; });
+      crawler.queue({ url: INDEX_PAGE, allowedDomains: [/\d+\.\d+\.\d+\.\d+/] });
+      return crawler.onIdle()
+        .then(() => {
+          assert.equal(requestskipped, 0);
+          assert.equal(onSuccess.callCount, 1);
+        });
+    });
+
+    it('skips crawling when the requested domain does not match allowed domain', () => {
      let requestskipped = 0;
      crawler.on('requestskipped', () => { requestskipped += 1; });
      crawler.queue({ url: INDEX_PAGE, allowedDomains: ['0.0.0.0'] });
      return crawler.onIdle()
        .then(() => {
          assert.equal(requestskipped, 1);
          assert.equal(onSuccess.callCount, 0);
        });
    });
 
@@ -213,6 +224,28 @@ describe('HCCrawler', () => {
+    it('skips crawling when the requested domain exactly matches denied domain', () => {
+      let requestskipped = 0;
+      crawler.on('requestskipped', () => { requestskipped += 1; });
+      crawler.queue({ url: INDEX_PAGE, deniedDomains: ['127.0.0.1'] });
+      return crawler.onIdle()
+        .then(() => {
+          assert.equal(requestskipped, 1);
+          assert.equal(onSuccess.callCount, 0);
+        });
+    });
+
+    it('skips crawling when the requested domain matches denied domain by regular expression', () => {
+      let requestskipped = 0;
+      crawler.on('requestskipped', () => { requestskipped += 1; });
+      crawler.queue({ url: INDEX_PAGE, deniedDomains: [/\d+\.\d+\.\d+\.\d+/] });
+      return crawler.onIdle()
+        .then(() => {
+          assert.equal(requestskipped, 1);
+          assert.equal(onSuccess.callCount, 0);
+        });
+    });
+
     it('follows links when maxDepth is set', () => {
       let maxdepthreached = 0;
       server.setContent('/1.html', `go to <a href="${PREFIX}/2.html">/2.html</a>`);
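One consequence of the ordering in `_checkAllowedDomains` above is that `deniedDomains` takes precedence over `allowedDomains`: a hostname matching both lists is skipped. A hypothetical sketch of that interaction; the domain names are placeholders:

```js
// Skipped: the deniedDomains check runs before the allowedDomains check,
// so the exact denied match wins even though the RegExp allows the host.
crawler.queue({
  url: 'https://news.example.com/',
  allowedDomains: [/\.example\.com$/],
  deniedDomains: ['news.example.com'],
});
```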