From 79b6e79f9e272c06de3e44fe16f70830551eb950 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 13 Apr 2026 15:19:43 +0200 Subject: [PATCH 1/6] chore(deps): bump axios from 1.13.2 to 1.15.0 (#3564) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [axios](https://github.com/axios/axios) from 1.13.2 to 1.15.0.
Release notes

Sourced from axios's releases.

v1.15.0

This release delivers two critical security patches, adds runtime support for Deno and Bun, and includes significant CI hardening, documentation improvements, and routine dependency updates.

⚠️ Important Changes

🔒 Security Fixes

🚀 New Features

🔧 Maintenance & Chores

🌟 New Contributors

We are thrilled to welcome our new contributors. Thank you for helping improve Axios:

v1.14.0

This release focuses on compatibility fixes, adapter stability improvements, and test/tooling modernisation.

⚠️ Important Changes

🚀 New Features

🐛 Bug Fixes

... (truncated)

Changelog

Sourced from axios's changelog.

Changelog

1.13.3 (2026-01-20)

Bug Fixes

Features

Reverts

Contributors to this release

... (truncated)

Commits
Maintainer changes

This version was pushed to npm by [GitHub Actions](https://www.npmjs.com/~github-actions), a new releaser for axios since your current version.

Install script changes

This version modifies the `prepare` script that runs during installation. Review the package contents before updating.


[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=axios&package-manager=npm_and_yarn&previous-version=1.13.2&new-version=1.15.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself) You can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/apify/crawlee/network/alerts).
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- yarn.lock | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/yarn.lock b/yarn.lock index 1144d78c311e..d1a146e59337 100644 --- a/yarn.lock +++ b/yarn.lock @@ -4634,13 +4634,13 @@ __metadata: linkType: hard "axios@npm:^1.12.0, axios@npm:^1.6.7": - version: 1.13.2 - resolution: "axios@npm:1.13.2" + version: 1.15.0 + resolution: "axios@npm:1.15.0" dependencies: - follow-redirects: "npm:^1.15.6" - form-data: "npm:^4.0.4" - proxy-from-env: "npm:^1.1.0" - checksum: 10c0/e8a42e37e5568ae9c7a28c348db0e8cf3e43d06fcbef73f0048669edfe4f71219664da7b6cc991b0c0f01c28a48f037c515263cb79be1f1ae8ff034cd813867b + follow-redirects: "npm:^1.15.11" + form-data: "npm:^4.0.5" + proxy-from-env: "npm:^2.1.0" + checksum: 10c0/47e0f860e98d4d7aa145e89ce0cae00e1fb0f1d2485f065c21fce955ddb1dba4103a46bd0e47acd18a27208a7f62c96249e620db575521b92a968619ab133409 languageName: node linkType: hard @@ -7653,7 +7653,7 @@ __metadata: languageName: node linkType: hard -"follow-redirects@npm:^1.15.6": +"follow-redirects@npm:^1.15.11": version: 1.15.11 resolution: "follow-redirects@npm:1.15.11" peerDependenciesMeta: @@ -7716,6 +7716,19 @@ __metadata: languageName: node linkType: hard +"form-data@npm:^4.0.5": + version: 4.0.5 + resolution: "form-data@npm:4.0.5" + dependencies: + asynckit: "npm:^0.4.0" + combined-stream: "npm:^1.0.8" + es-set-tostringtag: "npm:^2.1.0" + hasown: "npm:^2.0.2" + mime-types: "npm:^2.1.12" + checksum: 10c0/dd6b767ee0bbd6d84039db12a0fa5a2028160ffbfaba1800695713b46ae974a5f6e08b3356c3195137f8530dcd9dfcb5d5ae1eeff53d0db1e5aad863b619ce3b + languageName: node + linkType: hard + "formdata-node@npm:^4.3.2": version: 4.4.1 resolution: "formdata-node@npm:4.4.1" @@ -12676,6 +12689,13 @@ __metadata: languageName: node linkType: hard +"proxy-from-env@npm:^2.1.0": + version: 2.1.0 + resolution: "proxy-from-env@npm:2.1.0" + 
checksum: 10c0/ed01729fd4d094eab619cd7e17ce3698b3413b31eb102c4904f9875e677cd207392795d5b4adee9cec359dfd31c44d5ad7595a3a3ad51c40250e141512281c58 + languageName: node + linkType: hard + "proxy@npm:^1.0.2": version: 1.0.2 resolution: "proxy@npm:1.0.2" From 88fab22403a602d5005e62ce5d56e1619a83f7df Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 13 Apr 2026 15:19:51 +0200 Subject: [PATCH 2/6] chore(deps): bump axios from 1.13.5 to 1.15.0 in /website (#3565) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [axios](https://github.com/axios/axios) from 1.13.5 to 1.15.0.
Release notes

Sourced from axios's releases.

v1.15.0

This release delivers two critical security patches, adds runtime support for Deno and Bun, and includes significant CI hardening, documentation improvements, and routine dependency updates.

⚠️ Important Changes

  • Deprecation: url.parse() usage has been replaced to address Node.js deprecation warnings. If you are on a recent version of Node.js, this resolves console warnings you may have been seeing. (#10625)

🔒 Security Fixes

  • Proxy Handling: Fixed a no_proxy hostname normalisation bypass that could lead to Server-Side Request Forgery (SSRF). (#10661)
  • Header Injection: Fixed an unrestricted cloud metadata exfiltration vulnerability via a header injection chain. (#10660)

🚀 New Features

  • Runtime Support: Added compatibility checks and documentation for Deno and Bun environments. (#10652, #10653)

🔧 Maintenance & Chores

  • CI Security: Hardened workflow permissions to least privilege, added the zizmor security scanner, pinned action versions, and gated npm publishing with OIDC and environment protection. (#10618, #10619, #10627, #10637, #10666)
  • Dependencies: Bumped serialize-javascript, handlebars, picomatch, vite, and denoland/setup-deno to latest versions. Added a 7-day Dependabot cooldown period. (#10574, #10572, #10568, #10663, #10664, #10665, #10669, #10670, #10616)
  • Documentation: Unified docs, improved beforeRedirect credential leakage example, clarified withCredentials/withXSRFToken behaviour, HTTP/2 support notes, async/await timeout error handling, header case preservation, and various typo fixes. (#10649, #10624, #7452, #7471, #10654, #10644, #10589)
  • Housekeeping: Removed stale files, regenerated lockfile, and updated sponsor scripts and blocks. (#10584, #10650, #10582, #10640, #10659, #10668)
  • Tests: Added regression coverage for urlencoded Content-Type casing. (#10573)

🌟 New Contributors

We are thrilled to welcome our new contributors. Thank you for helping improve Axios:

v1.14.0

This release focuses on compatibility fixes, adapter stability improvements, and test/tooling modernisation.

⚠️ Important Changes

  • Breaking Changes: None identified in this release.
  • Action Required: If you rely on env-based proxy behaviour or CJS resolution edge-cases, validate your integration after upgrade (notably proxy-from-env v2 alignment and main entry compatibility fix).

🚀 New Features

  • Runtime Features: No new end-user features were introduced in this release.
  • Test Coverage Expansion: Added broader smoke/module test coverage for CJS and ESM package usage. (#7510)

🐛 Bug Fixes

  • Headers: Trim trailing CRLF in normalised header values. (#7456)
  • HTTP/2: Close detached HTTP/2 sessions on timeout to avoid lingering sessions. (#7457)
  • Fetch Adapter: Cancel ReadableStream created during request-stream capability probing to prevent async resource leaks. (#7515)
  • Proxy Handling: Fixed env proxy behavior with proxy-from-env v2 usage. (#7499)

... (truncated)

Changelog

Sourced from axios's changelog.

Changelog

1.13.3 (2026-01-20)

Bug Fixes

  • http2: Use port 443 for HTTPS connections by default. (#7256) (d7e6065)
  • interceptor: handle the error in the same interceptor (#6269) (5945e40)
  • main field in package.json should correspond to cjs artifacts (#5756) (7373fbf)
  • package.json: add 'bun' package.json 'exports' condition. Load the Node.js build in Bun instead of the browser build (#5754) (b89217e)
  • silentJSONParsing=false should throw on invalid JSON (#7253) (#7257) (7d19335)
  • turn AxiosError into a native error (#5394) (#5558) (1c6a86d)
  • types: add handlers to AxiosInterceptorManager interface (#5551) (8d1271b)
  • types: restore AxiosError.cause type from unknown to Error (#7327) (d8233d9)
  • unclear error message is thrown when specifying an empty proxy authorization (#6314) (6ef867e)

Features

Reverts

  • Revert "fix: silentJSONParsing=false should throw on invalid JSON (#7253) (#7…" (#7298) (a4230f5), closes #7253 #7 #7298
  • deps: bump peter-evans/create-pull-request from 7 to 8 in the github-actions group (#7334) (2d6ad5e)

Contributors to this release

... (truncated)

Commits
Install script changes

This version modifies the `prepare` script that runs during installation. Review the package contents before updating.


[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=axios&package-manager=npm_and_yarn&previous-version=1.13.5&new-version=1.15.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself) You can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/apify/crawlee/network/alerts).
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- website/package.json | 2 +- website/yarn.lock | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/website/package.json b/website/package.json index dd7b35448216..3a36f2db62f2 100644 --- a/website/package.json +++ b/website/package.json @@ -48,7 +48,7 @@ "@giscus/react": "^3.0.0", "@mdx-js/react": "^3.0.1", "@signalwire/docusaurus-plugin-llms-txt": "^1.2.1", - "axios": "^1.13.5", + "axios": "^1.15.0", "buffer": "^6.0.3", "clsx": "^2.0.0", "crypto-browserify": "^3.12.0", diff --git a/website/yarn.lock b/website/yarn.lock index dccc837e366c..eed0a239092e 100644 --- a/website/yarn.lock +++ b/website/yarn.lock @@ -6206,14 +6206,14 @@ __metadata: languageName: node linkType: hard -"axios@npm:^1.13.5": - version: 1.13.5 - resolution: "axios@npm:1.13.5" +"axios@npm:^1.15.0": + version: 1.15.0 + resolution: "axios@npm:1.15.0" dependencies: follow-redirects: "npm:^1.15.11" form-data: "npm:^4.0.5" - proxy-from-env: "npm:^1.1.0" - checksum: 10c0/abf468c34f2d145f3dc7dbc0f1be67e520630624307bda69a41bbe8d386bd672d87b4405c4ee77f9ff54b235ab02f96a9968fb00e75b13ce64706e352a3068fd + proxy-from-env: "npm:^2.1.0" + checksum: 10c0/47e0f860e98d4d7aa145e89ce0cae00e1fb0f1d2485f065c21fce955ddb1dba4103a46bd0e47acd18a27208a7f62c96249e620db575521b92a968619ab133409 languageName: node linkType: hard @@ -15104,10 +15104,10 @@ __metadata: languageName: node linkType: hard -"proxy-from-env@npm:^1.1.0": - version: 1.1.0 - resolution: "proxy-from-env@npm:1.1.0" - checksum: 10c0/fe7dd8b1bdbbbea18d1459107729c3e4a2243ca870d26d34c2c1bcd3e4425b7bcc5112362df2d93cc7fb9746f6142b5e272fd1cc5c86ddf8580175186f6ad42b +"proxy-from-env@npm:^2.1.0": + version: 2.1.0 + resolution: "proxy-from-env@npm:2.1.0" + checksum: 10c0/ed01729fd4d094eab619cd7e17ce3698b3413b31eb102c4904f9875e677cd207392795d5b4adee9cec359dfd31c44d5ad7595a3a3ad51c40250e141512281c58 languageName: 
node linkType: hard @@ -15997,7 +15997,7 @@ __metadata: "@types/react": "npm:^19.0.0" "@typescript-eslint/eslint-plugin": "npm:^7.0.0" "@typescript-eslint/parser": "npm:^7.0.0" - axios: "npm:^1.13.5" + axios: "npm:^1.15.0" buffer: "npm:^6.0.3" clsx: "npm:^2.0.0" crypto-browserify: "npm:^3.12.0" From 9655f4cdf5caceb724e82e7363b69f6ce94fdb72 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 13 Apr 2026 15:19:58 +0200 Subject: [PATCH 3/6] chore(deps): bump basic-ftp from 5.2.0 to 5.2.2 (#3563) Bumps [basic-ftp](https://github.com/patrickjuchli/basic-ftp) from 5.2.0 to 5.2.2.
Release notes

Sourced from basic-ftp's releases.

5.2.2

5.2.1

Changelog

Sourced from basic-ftp's changelog.

5.2.2

5.2.1

Commits
  • e9d09d6 Bump version
  • 20327d3 Move prevention of control character injection to more central place
  • ba40f9d Update dev dependencies
  • 6b0008b Bump version
  • 2ecc8e2 Reject control character injection attempts using paths
  • 515d21f Update security policy and reporting instructions
  • 9744254 Link to security advisory
  • See full diff in compare view
Maintainer changes

This version was pushed to npm by patrickjuchli, a new releaser for basic-ftp since your current version.

Install script changes

This version adds a `prepare` script that runs during installation. Review the package contents before updating.


[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=basic-ftp&package-manager=npm_and_yarn&previous-version=5.2.0&new-version=5.2.2)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself) You can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/apify/crawlee/network/alerts).
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- yarn.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/yarn.lock b/yarn.lock index d1a146e59337..5c1c803e1c97 100644 --- a/yarn.lock +++ b/yarn.lock @@ -4768,9 +4768,9 @@ __metadata: linkType: hard "basic-ftp@npm:^5.0.2": - version: 5.2.0 - resolution: "basic-ftp@npm:5.2.0" - checksum: 10c0/a0f85c01deae0723021f9bf4a7be29378186fa8bba41e74ea11832fe74c187ce90c3599c3cc5ec936581cfd150020e79f4a9ed0ee9fb20b2308e69b045f3a059 + version: 5.2.2 + resolution: "basic-ftp@npm:5.2.2" + checksum: 10c0/a314a05450cf6311035d1bbb23c1ba1c8c0b991e7cb9bfafafc72a82bfafc540561c22eb046a58374688b7b9df502aa002fc28f4d366eb40964f307d131e06a6 languageName: node linkType: hard From facd4b3077867781ef35474ed90dcbdec6ef7850 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Ad=C3=A1mek?= Date: Mon, 13 Apr 2026 15:20:31 +0200 Subject: [PATCH 4/6] chore(deps): bump @apify/eslint-config to ^2.0.0 (#3562) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Bumps `@apify/eslint-config` from `^1.0.0` to `^2.0.0`. v2.0.0 ([apify/apify-eslint-config#35](https://github.com/apify/apify-eslint-config/pull/35)): - drops the unmaintained `eslint-config-airbnb-base` dep - preserves the meaningful airbnb rules inline (eqeqeq, no-var, prefer-const, no-param-reassign, security rules, etc.) - moves stylistic rules to an opt-in `@apify/eslint-config/style` export ## Lint impact `yarn run lint` against this branch: **0 errors, 0 warnings.** The bump surfaced 7 dead `// eslint-disable` directives — comments for rules the new config no longer enables (`no-cond-assign`, `no-unreachable-loop`, plus generic disables in test files). 
Cleaned up: - `packages/core/src/storages/request_queue_v2.ts` — removed `no-cond-assign` disable - `packages/memory-storage/test/async-iteration.test.ts` — removed 3 × `no-unreachable-loop` disables - `test/core/session_pool/session_pool.test.ts` — removed 3 trailing `//eslint-disable-line` comments Zero real new lint findings. > **Note:** This branch was tested locally by bypassing `npmMinimalAgeGate` (`YARN_NPM_MINIMAL_AGE_GATE=0`) since `@apify/eslint-config@2.0.0` was published less than 24 hours ago. Once it crosses the 1-day threshold, CI will pass without intervention. ## Test plan - [x] `yarn install` - [x] `yarn run lint` — clean 🤖 Generated with [Claude Code](https://claude.com/claude-code) --- package.json | 2 +- .../core/src/storages/request_queue_v2.ts | 1 - .../test/async-iteration.test.ts | 3 - test/core/session_pool/session_pool.test.ts | 5 +- yarn.lock | 70 ++++--------------- 5 files changed, 16 insertions(+), 65 deletions(-) diff --git a/package.json b/package.json index 7e83f2f8ee4d..2df64b378d59 100644 --- a/package.json +++ b/package.json @@ -55,7 +55,7 @@ "prepare": "husky" }, "devDependencies": { - "@apify/eslint-config": "^1.0.0", + "@apify/eslint-config": "^2.0.0", "@apify/log": "^2.4.0", "@apify/tsconfig": "^0.1.0", "@biomejs/biome": "^2.3.11", diff --git a/packages/core/src/storages/request_queue_v2.ts b/packages/core/src/storages/request_queue_v2.ts index 7dd8157d7ca0..fd9e63f23111 100644 --- a/packages/core/src/storages/request_queue_v2.ts +++ b/packages/core/src/storages/request_queue_v2.ts @@ -540,7 +540,6 @@ export class RequestQueue extends RequestProvider { this.queuePausedForMigration = true; let requestId: string | null; - // eslint-disable-next-line no-cond-assign while ((requestId = this.queueHeadIds.removeFirst()) !== null) { try { await this.client.deleteRequestLock(requestId); diff --git a/packages/memory-storage/test/async-iteration.test.ts b/packages/memory-storage/test/async-iteration.test.ts index 
8303dd61d291..b3f512561c80 100644 --- a/packages/memory-storage/test/async-iteration.test.ts +++ b/packages/memory-storage/test/async-iteration.test.ts @@ -204,7 +204,6 @@ describe('Async iteration support', () => { }); test('yields strings directly, not objects', async () => { - // eslint-disable-next-line no-unreachable-loop for await (const key of kvStore.keys()) { expect(typeof key).toBe('string'); break; // Only need to check the first one @@ -291,7 +290,6 @@ describe('Async iteration support', () => { }); test('yields values directly, not KeyValueStoreRecord objects', async () => { - // eslint-disable-next-line no-unreachable-loop for await (const value of kvStore.values()) { // Should be the actual value, not a record wrapper expect(value).toStrictEqual({ data: 'key-00' }); @@ -380,7 +378,6 @@ describe('Async iteration support', () => { }); test('yields [key, value] tuples', async () => { - // eslint-disable-next-line no-unreachable-loop for await (const [key, value] of kvStore.entries()) { expect(typeof key).toBe('string'); expect(key).toBe('key-00'); diff --git a/test/core/session_pool/session_pool.test.ts b/test/core/session_pool/session_pool.test.ts index d80957279693..ec8f6565c3e1 100644 --- a/test/core/session_pool/session_pool.test.ts +++ b/test/core/session_pool/session_pool.test.ts @@ -112,18 +112,17 @@ describe('SessionPool - testing session pool', () => { await sessionPool.getSession(); let isCalled = false; // @ts-expect-error Accessing private property - const oldPick = sessionPool._pickSession; //eslint-disable-line + const oldPick = sessionPool._pickSession; // @ts-expect-error Overriding private property sessionPool._pickSession = () => { - //eslint-disable-line isCalled = true; return oldPick.bind(sessionPool)(); }; await sessionPool.getSession(); - expect(isCalled).toBe(true); //eslint-disable-line + expect(isCalled).toBe(true); }); test('should delete picked session when it is unusable and create a new one', async () => { diff --git 
a/yarn.lock b/yarn.lock index 5c1c803e1c97..918bf864aabe 100644 --- a/yarn.lock +++ b/yarn.lock @@ -232,25 +232,29 @@ __metadata: languageName: node linkType: hard -"@apify/eslint-config@npm:^1.0.0": - version: 1.1.0 - resolution: "@apify/eslint-config@npm:1.1.0" +"@apify/eslint-config@npm:^2.0.0": + version: 2.0.0 + resolution: "@apify/eslint-config@npm:2.0.0" dependencies: - "@eslint/compat": "npm:^1.2.6" - eslint-config-airbnb-base: "npm:^15.0.0" eslint-plugin-import: "npm:^2.32.0" eslint-plugin-simple-import-sort: "npm:^12.1.1" globals: "npm:^15.14.0" peerDependencies: + "@stylistic/eslint-plugin": ^5.0.0 + "@vitest/eslint-plugin": ^1.6.14 eslint: ^9.19.0 eslint-plugin-jest: ^28.11.0 typescript-eslint: ^8.23.0 peerDependenciesMeta: + "@stylistic/eslint-plugin": + optional: true + "@vitest/eslint-plugin": + optional: true eslint-plugin-jest: optional: true typescript-eslint: optional: true - checksum: 10c0/9c1461d859d02bbbb59a6004aa289054a7fca33e573d703ffb6fe62f021607ba298e1dba2ac8c1cc43362150be5444e0112efa98f768d8d06409c3f939671c0e + checksum: 10c0/b2139b231e735853f0d6c10b9962a8da5f49cb2388429368484471366423a5d0cc7be68d376f6bb8fdaa5825d100f52be913069b3a8b38821dc883e278225652 languageName: node linkType: hard @@ -1059,7 +1063,7 @@ __metadata: version: 0.0.0-use.local resolution: "@crawlee/root@workspace:." 
dependencies: - "@apify/eslint-config": "npm:^1.0.0" + "@apify/eslint-config": "npm:^2.0.0" "@apify/log": "npm:^2.4.0" "@apify/tsconfig": "npm:^0.1.0" "@biomejs/biome": "npm:^2.3.11" @@ -1665,20 +1669,6 @@ __metadata: languageName: node linkType: hard -"@eslint/compat@npm:^1.2.6": - version: 1.4.1 - resolution: "@eslint/compat@npm:1.4.1" - dependencies: - "@eslint/core": "npm:^0.17.0" - peerDependencies: - eslint: ^8.40 || 9 - peerDependenciesMeta: - eslint: - optional: true - checksum: 10c0/46f5ff884873c2e2366df55dd7b2d6b12f7f852bfba8e2a48dae4819cc5e58756deefa9b7f87f1b107af725ee883a05fcc02caf969b58fb142e790c6036a0450 - languageName: node - linkType: hard - "@eslint/config-array@npm:^0.21.1": version: 0.21.1 resolution: "@eslint/config-array@npm:0.21.1" @@ -5594,13 +5584,6 @@ __metadata: languageName: node linkType: hard -"confusing-browser-globals@npm:^1.0.10": - version: 1.0.11 - resolution: "confusing-browser-globals@npm:1.0.11" - checksum: 10c0/475d0a284fa964a5182b519af5738b5b64bf7e413cfd703c1b3496bf6f4df9f827893a9b221c0ea5873c1476835beb1e0df569ba643eff0734010c1eb780589e - languageName: node - linkType: hard - "console-control-strings@npm:^1.1.0": version: 1.1.0 resolution: "console-control-strings@npm:1.1.0" @@ -6913,21 +6896,6 @@ __metadata: languageName: node linkType: hard -"eslint-config-airbnb-base@npm:^15.0.0": - version: 15.0.0 - resolution: "eslint-config-airbnb-base@npm:15.0.0" - dependencies: - confusing-browser-globals: "npm:^1.0.10" - object.assign: "npm:^4.1.2" - object.entries: "npm:^1.1.5" - semver: "npm:^6.3.0" - peerDependencies: - eslint: ^7.32.0 || ^8.2.0 - eslint-plugin-import: ^2.25.2 - checksum: 10c0/93639d991654414756f82ad7860aac30b0dc6797277b7904ddb53ed88a32c470598696bbc6c503e066414024d305221974d3769e6642de65043bedf29cbbd30f - languageName: node - linkType: hard - "eslint-config-prettier@npm:^10.1.1": version: 10.1.8 resolution: "eslint-config-prettier@npm:10.1.8" @@ -11568,7 +11536,7 @@ __metadata: languageName: node linkType: hard 
-"object.assign@npm:^4.1.2, object.assign@npm:^4.1.4, object.assign@npm:^4.1.7": +"object.assign@npm:^4.1.4, object.assign@npm:^4.1.7": version: 4.1.7 resolution: "object.assign@npm:4.1.7" dependencies: @@ -11582,18 +11550,6 @@ __metadata: languageName: node linkType: hard -"object.entries@npm:^1.1.5": - version: 1.1.9 - resolution: "object.entries@npm:1.1.9" - dependencies: - call-bind: "npm:^1.0.8" - call-bound: "npm:^1.0.4" - define-properties: "npm:^1.2.1" - es-object-atoms: "npm:^1.1.1" - checksum: 10c0/d4b8c1e586650407da03370845f029aa14076caca4e4d4afadbc69cfb5b78035fd3ee7be417141abdb0258fa142e59b11923b4c44d8b1255b28f5ffcc50da7db - languageName: node - linkType: hard - "object.fromentries@npm:^2.0.8": version: 2.0.8 resolution: "object.fromentries@npm:2.0.8" @@ -13468,7 +13424,7 @@ __metadata: languageName: node linkType: hard -"semver@npm:^6.3.0, semver@npm:^6.3.1": +"semver@npm:^6.3.1": version: 6.3.1 resolution: "semver@npm:6.3.1" bin: From b23319bbe8c171541f17f343f718867e25bdb620 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Mon, 13 Apr 2026 17:57:28 +0200 Subject: [PATCH 5/6] fix: Prevent accidental request dropping with `maxRequestsPerCrawl` (#3531) --- .../src/internals/basic-crawler.ts | 22 ++-- .../core/src/enqueue_links/enqueue_links.ts | 16 +-- .../core/src/storages/request_provider.ts | 98 +++++++++++++---- .../internals/adaptive-playwright-crawler.ts | 1 + packages/utils/src/internals/iterables.ts | 30 ++++-- test/core/crawlers/basic_crawler.test.ts | 102 +++++++++++++++++- test/core/enqueue_links/enqueue_links.test.ts | 2 +- test/utils/iterables.test.ts | 48 +++++++++ 8 files changed, 270 insertions(+), 49 deletions(-) diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts index 3ff8e2c63421..850dda046f97 100644 --- a/packages/basic-crawler/src/internals/basic-crawler.ts +++ b/packages/basic-crawler/src/internals/basic-crawler.ts @@ -1202,7 +1202,6 @@ export class 
BasicCrawler(); - const skippedBecauseOfLimit = new Set(); const skippedBecauseOfMaxCrawlDepth = new Set(); const isAllowedBasedOnRobotsTxtFile = this.isAllowedBasedOnRobotsTxtFile.bind(this); @@ -1216,16 +1215,9 @@ export class BasicCrawler= requestLimit) { - skippedBecauseOfLimit.add(url); - continue; - } - if (maxCrawlDepth !== undefined && (request as any).crawlDepth > maxCrawlDepth) { skippedBecauseOfMaxCrawlDepth.add(url); continue; @@ -1233,14 +1225,19 @@ export class BasicCrawler 0) { this.log.warning(`Some requests were skipped because they were disallowed based on the robots.txt file`, { @@ -1250,7 +1247,7 @@ export class BasicCrawler 0 || - skippedBecauseOfLimit.size > 0 || + skippedBecauseOfLimit.length > 0 || skippedBecauseOfMaxCrawlDepth.size > 0 ) { await Promise.all( @@ -1259,7 +1256,8 @@ export class BasicCrawler { + skippedBecauseOfLimit.map((request) => { + const url = typeof request === 'string' ? request : request.url!; return this.handleSkippedRequest({ url, reason: 'limit' }); }), [...skippedBecauseOfMaxCrawlDepth].map((url) => { diff --git a/packages/core/src/enqueue_links/enqueue_links.ts b/packages/core/src/enqueue_links/enqueue_links.ts index 5d6d2fce0e55..5668e9353ee6 100644 --- a/packages/core/src/enqueue_links/enqueue_links.ts +++ b/packages/core/src/enqueue_links/enqueue_links.ts @@ -488,17 +488,19 @@ export async function enqueueLinks( return filtered; } - let requests = await createFilteredRequests(); - if (typeof limit === 'number' && limit < requests.length) { - await reportSkippedRequests(requests.slice(limit), 'enqueueLimit'); - requests = requests.slice(0, limit); - } - - const { addedRequests } = await requestQueue.addRequestsBatched(requests, { + const { addedRequests, requestsOverLimit } = await requestQueue.addRequestsBatched(await createFilteredRequests(), { forefront, waitForAllRequestsToBeAdded, + maxNewRequests: limit, }); + if (requestsOverLimit?.length !== undefined && requestsOverLimit.length > 0) { + await 
reportSkippedRequests( + requestsOverLimit.map((r) => ({ url: typeof r === 'string' ? r : r.url! })), + 'enqueueLimit', + ); + } + return { processedRequests: addedRequests, unprocessedRequests: [] }; } diff --git a/packages/core/src/storages/request_provider.ts b/packages/core/src/storages/request_provider.ts index dc1204abf5aa..e4105e856cb3 100644 --- a/packages/core/src/storages/request_provider.ts +++ b/packages/core/src/storages/request_provider.ts @@ -413,6 +413,7 @@ export abstract class RequestProvider implements IStorage, IRequestManager { waitForAllRequestsToBeAdded: ow.optional.boolean, batchSize: ow.optional.number, waitBetweenBatchesMillis: ow.optional.number, + maxNewRequests: ow.optional.number, }), ); @@ -454,9 +455,21 @@ export abstract class RequestProvider implements IStorage, IRequestManager { } } - const { batchSize = 1000, waitBetweenBatchesMillis = 1000 } = options; + const { batchSize = 1000, waitBetweenBatchesMillis = 1000, maxNewRequests } = options; - const chunks = peekableAsyncIterable(chunkedAsyncIterable(generateRequests(), batchSize)); + let remainingBudget = maxNewRequests ?? Infinity; + const requestsOverLimit: Source[] = []; + + // If there's a limit on the number of added requests, do not send batches bigger than the limit + const effectiveChunkSize = + maxNewRequests !== undefined ? 
() => Math.min(batchSize, remainingBudget) : batchSize; + + // Hold onto the underlying iterator so we can drain leftovers from it in buildResult + const requestIterator = generateRequests(); + + const chunks = peekableAsyncIterable( + chunkedAsyncIterable(requestIterator, effectiveChunkSize) as AsyncIterable, + ); const chunksIterator = chunks[Symbol.asyncIterator](); const attemptToAddToQueueAndAddAnyUnprocessed = async (providedRequests: Source[], cache = true) => { @@ -480,21 +493,55 @@ export abstract class RequestProvider implements IStorage, IRequestManager { return resultsToReturn; }; - // Add initial batch of `batchSize` to process them right away + /** + * Process a chunk: send it to the queue, then update the remaining budget if maxNewRequests is active. + */ + const processChunk = async (chunk: Source[], cache = true) => { + const results = await attemptToAddToQueueAndAddAnyUnprocessed(chunk, cache); + + if (maxNewRequests !== undefined) { + remainingBudget -= results.filter((r) => !r.wasAlreadyPresent).length; + } + + return results; + }; + + /** + * Build the final result. When maxNewRequests is set, drains any remaining items + * from the underlying request iterator into requestsOverLimit. + * + * We accept the iterator explicitly (rather than closing over it) to make it obvious + * that this is the *same* iterator that `chunkedAsyncIterable` has been consuming — + * so only unconsumed items are drained. We drain `requestIterator` (not `chunks`) + * because `chunkedAsyncIterable` stops yielding when the budget-based chunk size + * drops to 0, leaving unconsumed items in the underlying iterator. 
+ */ + const buildResult = async ( + addedRequests: ProcessedRequest[], + waitForAllRequestsToBeAdded: Promise, + unconsumedIterator: AsyncGenerator, + ): Promise => { + if (maxNewRequests !== undefined) { + for await (const request of unconsumedIterator) { + requestsOverLimit.push(request); + } + } + + return { addedRequests, waitForAllRequestsToBeAdded, requestsOverLimit }; + }; + + // Add initial batch to process right away const initialChunk = await chunksIterator.peek(); if (initialChunk === undefined) { - return { addedRequests: [], waitForAllRequestsToBeAdded: Promise.resolve([]) }; + return buildResult([], Promise.resolve([]), requestIterator); } - const addedRequests = await attemptToAddToQueueAndAddAnyUnprocessed(initialChunk); + const addedRequests = await processChunk(initialChunk); await chunksIterator.next(); - // If we have no more requests to add, return immediately + // If we have no more requests to add (either exhausted or budget hit), return immediately if ((await chunksIterator.peek()) === undefined) { - return { - addedRequests, - waitForAllRequestsToBeAdded: Promise.resolve([]), - }; + return buildResult(addedRequests, Promise.resolve([]), requestIterator); } // eslint-disable-next-line no-async-promise-executor @@ -502,8 +549,7 @@ export abstract class RequestProvider implements IStorage, IRequestManager { const finalAddedRequests: ProcessedRequest[] = []; for await (const requestChunk of chunks) { - finalAddedRequests.push(...(await attemptToAddToQueueAndAddAnyUnprocessed(requestChunk, false))); - + finalAddedRequests.push(...(await processChunk(requestChunk, false))); await sleep(waitBetweenBatchesMillis); } @@ -515,15 +561,12 @@ export abstract class RequestProvider implements IStorage, IRequestManager { this.inProgressRequestBatchCount -= 1; }); - // If the user wants to wait for all the requests to be added, we wait for the promise to resolve for them - if (options.waitForAllRequestsToBeAdded) { + // When maxNewRequests is set, we must 
wait for all batches so we can accurately report skipped requests. + if (options.waitForAllRequestsToBeAdded || maxNewRequests !== undefined) { addedRequests.push(...(await promise)); } - return { - addedRequests, - waitForAllRequestsToBeAdded: promise, - }; + return buildResult(addedRequests, promise, requestIterator); } /** @@ -980,6 +1023,18 @@ export interface AddRequestsBatchedOptions extends RequestQueueOperationOptions * @default 1000 */ waitBetweenBatchesMillis?: number; + + /** + * If set, only this many *actually new* requests (i.e. not already present in the queue) will be added. + * Once the budget is reached, remaining requests from the iterable will be collected in + * {@apilink AddRequestsBatchedResult.requestsOverLimit|`requestsOverLimit`} instead. + * + * This is useful in combination with `maxRequestsPerCrawl` to avoid duplicate URLs consuming the budget. + * + * **Note:** Setting this option implicitly enables {@apilink AddRequestsBatchedOptions.waitForAllRequestsToBeAdded|`waitForAllRequestsToBeAdded`}, + * since all batches must complete before leftover requests can be accurately reported. + */ + maxNewRequests?: number; } export interface AddRequestsBatchedResult { @@ -1001,4 +1056,11 @@ export interface AddRequestsBatchedResult { * ``` */ waitForAllRequestsToBeAdded: Promise; + + /** + * Requests from the input that were not added to the queue because the + * {@apilink AddRequestsBatchedOptions.maxNewRequests|`maxNewRequests`} budget was reached. + * Empty when `maxNewRequests` is not set. 
+ */ + requestsOverLimit?: Source[]; } diff --git a/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts b/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts index 2e6d5c91f551..1766ac47dc34 100644 --- a/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts +++ b/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts @@ -708,6 +708,7 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler { wasAlreadyHandled: false, })), waitForAllRequestsToBeAdded: Promise.resolve([]), + requestsOverLimit: [], }; }; // We need to use a mock request queue implementation, in order to add the requests into our result object diff --git a/packages/utils/src/internals/iterables.ts b/packages/utils/src/internals/iterables.ts index ed55bde56369..36f6d0f24ff7 100644 --- a/packages/utils/src/internals/iterables.ts +++ b/packages/utils/src/internals/iterables.ts @@ -85,24 +85,34 @@ export async function* asyncifyIterable(iterable: Iterable | AsyncIterable */ export async function* chunkedAsyncIterable( iterable: AsyncIterable | Iterable, - chunkSize: number, + chunkSize: number | (() => number), ): AsyncIterable { - if (typeof chunkSize !== 'number' || chunkSize < 1) { + const getChunkSize = typeof chunkSize === 'function' ? chunkSize : () => chunkSize; + + if (typeof chunkSize === 'number' && chunkSize < 1) { throw new Error(`Chunk size must be a positive number (${inspect(chunkSize)}) received`); } - let chunk: T[] = []; + const iterator = + Symbol.asyncIterator in iterable + ? 
(iterable as AsyncIterable)[Symbol.asyncIterator]() + : (iterable as Iterable)[Symbol.iterator](); + + while (true) { + const currentSize = getChunkSize(); + if (currentSize < 1) break; - for await (const item of iterable) { - chunk.push(item); + const chunk: T[] = []; - if (chunk.length >= chunkSize) { - yield chunk; - chunk = []; + for (let i = 0; i < currentSize; i++) { + const next = await iterator.next(); + if (next.done) { + break; + } + chunk.push(next.value); } - } - if (chunk.length) { + if (chunk.length === 0) break; yield chunk; } } diff --git a/test/core/crawlers/basic_crawler.test.ts b/test/core/crawlers/basic_crawler.test.ts index c455fbf4f402..16410e298f50 100644 --- a/test/core/crawlers/basic_crawler.test.ts +++ b/test/core/crawlers/basic_crawler.test.ts @@ -250,7 +250,11 @@ describe('BasicCrawler', () => { const crawler = new TestCrawler({ maxCrawlDepth: 3 }); beforeEach(() => { - addRequestsBatchedMock = vi.fn().mockImplementation(async () => ({})); + addRequestsBatchedMock = vi.fn().mockImplementation(async () => ({ + addedRequests: [], + waitForAllRequestsToBeAdded: Promise.resolve([]), + requestsOverLimit: [], + })); onSkippedRequestMock = vi.fn(); options = { @@ -1885,6 +1889,102 @@ describe('BasicCrawler', () => { ); expect(maxCrawlDepthMessages).toHaveLength(1); }); + + test('should not count duplicate URLs toward maxRequestsPerCrawl limit (addRequests)', async () => { + const requestQueue = await RequestQueue.open(); + + const crawler = new BasicCrawler({ + requestQueue, + maxRequestsPerCrawl: 5, + requestHandler: async () => {}, + }); + + // 10 duplicate links to the same URL + 1 unique link at the end + const requestsToAdd = [ + ...Array.from({ length: 10 }, () => 'http://example.com/same'), + 'http://example.com/new', + ]; + + await crawler.addRequests(requestsToAdd); + + // Both unique URLs should have been enqueued — duplicates should not consume the budget + await 
expect(localStorageEmulator.getRequestQueueItems()).resolves.toMatchObject([ + { url: 'http://example.com/same' }, + { url: 'http://example.com/new' }, + ]); + }); + + test('addRequestsBatched with maxNewRequests should correctly report requestsOverLimit for array input', async () => { + const queue = await RequestQueue.open(); + + const result = await queue.addRequestsBatched( + [ + { url: 'http://example.com/a' }, + { url: 'http://example.com/b' }, + { url: 'http://example.com/c' }, + { url: 'http://example.com/d' }, + { url: 'http://example.com/e' }, + ], + { maxNewRequests: 2 }, + ); + + const addedUrls = result.addedRequests.filter((r) => !r.wasAlreadyPresent).map((r) => r.uniqueKey); + + const overLimitUrls = (result.requestsOverLimit ?? []).map((r) => (typeof r === 'string' ? r : r.url)); + + expect(addedUrls).toHaveLength(2); + expect(overLimitUrls).toHaveLength(3); + }); + + test('addRequestsBatched with maxNewRequests should correctly report requestsOverLimit for generator input', async () => { + const queue = await RequestQueue.open(); + + async function* urls() { + yield { url: 'http://example.com/a' }; + yield { url: 'http://example.com/b' }; + yield { url: 'http://example.com/c' }; + yield { url: 'http://example.com/d' }; + yield { url: 'http://example.com/e' }; + } + + const result = await queue.addRequestsBatched(urls(), { maxNewRequests: 2 }); + + const addedUrls = result.addedRequests.filter((r) => !r.wasAlreadyPresent).map((r) => r.uniqueKey); + + const overLimitUrls = (result.requestsOverLimit ?? []).map((r) => (typeof r === 'string' ? 
r : r.url)); + + expect(addedUrls).toHaveLength(2); + expect(overLimitUrls).toHaveLength(3); + }); + + test('should not count duplicate URLs toward maxRequestsPerCrawl limit (enqueueLinks)', async () => { + const requestQueue = await RequestQueue.open(); + + const visitedUrls: string[] = []; + + const crawler = new BasicCrawler({ + requestQueue, + maxRequestsPerCrawl: 5, + requestHandler: async (context) => { + visitedUrls.push(context.request.url); + + if (context.request.label) { + return; + } + + // Enqueue 10 duplicate links + 1 new unique link + const urls = [...Array.from({ length: 10 }, () => 'http://example.com/'), 'http://example.com/new']; + + await context.enqueueLinks({ urls, label: 'child' }); + }, + }); + + await crawler.run(['http://example.com/']); + + // Both the start URL and the new URL should have been visited + expect(visitedUrls).toContain('http://example.com/'); + expect(visitedUrls).toContain('http://example.com/new'); + }); }); describe('addRequests input validation', () => { diff --git a/test/core/enqueue_links/enqueue_links.test.ts b/test/core/enqueue_links/enqueue_links.test.ts index fbf34c604973..cede47fec3c6 100644 --- a/test/core/enqueue_links/enqueue_links.test.ts +++ b/test/core/enqueue_links/enqueue_links.test.ts @@ -1008,7 +1008,7 @@ describe('enqueueLinks()', () => { for await (const request of requests) { enqueued.push({ request: typeof request === 'string' ? 
{ url: request } : request, options }); } - return { addedRequests: [], waitForAllRequestsToBeAdded: Promise.resolve([]) }; + return { addedRequests: [], waitForAllRequestsToBeAdded: Promise.resolve([]), requestsOverLimit: [] }; }; await cheerioCrawlerEnqueueLinks({ diff --git a/test/utils/iterables.test.ts b/test/utils/iterables.test.ts index f571277c22aa..0a77ead0eddc 100644 --- a/test/utils/iterables.test.ts +++ b/test/utils/iterables.test.ts @@ -92,6 +92,54 @@ describe('chunkedAsyncIterable', () => { expect(result).toEqual([]); }); + it('should accept a callback for dynamic chunk size', async () => { + let size = 3; + const result = []; + for await (const chunk of chunkedAsyncIterable([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], () => size)) { + result.push(chunk); + size = 2; // shrink after first chunk + } + + expect(result).toEqual([[1, 2, 3], [4, 5], [6, 7], [8, 9], [10]]); + }); + + it('should stop iterating when dynamic chunk size drops to zero', async () => { + let size = 2; + const result = []; + for await (const chunk of chunkedAsyncIterable([1, 2, 3, 4, 5, 6], () => size)) { + result.push(chunk); + size = 0; // signal stop after first chunk + } + + expect(result).toEqual([[1, 2]]); + }); + + it('should leave the underlying iterator drainable after partial consumption', async () => { + async function* source() { + yield 1; + yield 2; + yield 3; + yield 4; + yield 5; + } + + const iterator = source(); + + // Consume only the first chunk via chunkedAsyncIterable + let size = 2; + for await (const _ of chunkedAsyncIterable(iterator, () => size)) { + size = 0; // stop after first chunk + } + + // The underlying iterator should still be drainable + const remaining: number[] = []; + for await (const value of iterator) { + remaining.push(value); + } + + expect(remaining).toEqual([3, 4, 5]); + }); + it('should throw error for invalid chunk size', async () => { await expect( (async () => { From 1d4f6b9ca44b8224f14584c337bd80209eea3819 Mon Sep 17 00:00:00 2001 From: "Stefan 
B." Date: Mon, 13 Apr 2026 18:14:42 +0200 Subject: [PATCH 6/6] feat(utils): add sitemapFilter option to parseSitemap (#3557) --- packages/utils/src/internals/sitemap.ts | 13 ++++++++++++ packages/utils/test/sitemap.test.ts | 27 +++++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/packages/utils/src/internals/sitemap.ts b/packages/utils/src/internals/sitemap.ts index d74eda26445f..4b4fbada3e80 100644 --- a/packages/utils/src/internals/sitemap.ts +++ b/packages/utils/src/internals/sitemap.ts @@ -194,6 +194,13 @@ export interface ParseSitemapOptions { * @default true */ reportNetworkErrors?: boolean; + /** + * Optional filter for nested sitemap URLs discovered in sitemap index files. + * Called with the URL of each child sitemap before it is fetched. + * Return `true` to include the sitemap, `false` to skip it. + * If not provided, all nested sitemaps are followed. + */ + nestedSitemapFilter?: (sitemapUrl: string) => boolean; } export async function* parseSitemap( @@ -209,6 +216,7 @@ export async function* parseSitemap( sitemapRetries = 3, networkTimeouts, reportNetworkErrors = true, + nestedSitemapFilter, } = options ?? {}; const sources = [...initialSources]; @@ -340,6 +348,11 @@ export async function* parseSitemap( for await (const item of items) { if (item.type === 'sitemapUrl' && !visitedSitemapUrls.has(item.url)) { + if (nestedSitemapFilter && !nestedSitemapFilter(item.url)) { + log.debug(`Skipping sitemap ${item.url} due to nestedSitemapFilter.`); + continue; + } + sources.push({ type: 'url', url: item.url, depth: (source.depth ?? 
0) + 1 }); if (emitNestedSitemaps) { yield { loc: item.url, originSitemapUrl: null } as any; diff --git a/packages/utils/test/sitemap.test.ts b/packages/utils/test/sitemap.test.ts index 2fbc54928852..6a8545cb00ed 100644 --- a/packages/utils/test/sitemap.test.ts +++ b/packages/utils/test/sitemap.test.ts @@ -294,6 +294,33 @@ describe('Sitemap', () => { ); }); + it('respects nestedSitemapFilter when following sitemap indexes', async () => { + const items: SitemapUrl[] = []; + + for await (const item of parseSitemap( + [{ type: 'url', url: 'http://not-exists.com/sitemap_parent.xml' }], + undefined, + { + nestedSitemapFilter: (url) => !url.includes('sitemap_child_2'), + }, + )) { + items.push(item); + } + + expect(items).toHaveLength(5); + expect(items.every((item) => item.originSitemapUrl === 'http://not-exists.com/sitemap_child.xml')).toBe(true); + }); + + it('follows all nested sitemaps when nestedSitemapFilter is not provided', async () => { + const items: SitemapUrl[] = []; + + for await (const item of parseSitemap([{ type: 'url', url: 'http://not-exists.com/sitemap_parent.xml' }])) { + items.push(item); + } + + expect(items).toHaveLength(10); + }); + it('does not break on invalid xml', async () => { const sitemap = await Sitemap.load('http://not-exists.com/not_actual_xml.xml'); expect(sitemap.urls).toEqual([]);