From 066c47524d0a1fb6feb0b9d4990ccd9f5c0667f0 Mon Sep 17 00:00:00 2001 From: Gary Illyes <51719901+garyillyes@users.noreply.github.com> Date: Thu, 29 Jan 2026 15:03:00 +0100 Subject: [PATCH 1/4] Update robots_txt.js to extract unknown rules dynamically. --- dist/robots_txt.js | 69 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 53 insertions(+), 16 deletions(-) diff --git a/dist/robots_txt.js b/dist/robots_txt.js index 5a89778..d674a68 100644 --- a/dist/robots_txt.js +++ b/dist/robots_txt.js @@ -64,17 +64,20 @@ const parseRecords = (text)=>{ const splitOnLines = (r)=>r.split(/[\r\n]+/g).filter((e)=>e.length > 0); const lines = splitOnLines(cleanLines(text)); - const rec_types = Object.keys(RECORD_COUNT_TYPES).join('|'); - const regex = new RegExp(`(${rec_types})(?=\\s*:)`,'gi'); - - const records = [].map.call(lines, line=>{ - + // Match any valid token followed by a colon. While rule key-value pairs + // don't have a pattern definition in RFC9309, based on common rules in + // robots.txt files, we can assume the rule keys follow the pattern of the + // product token ("user-agent": letters, "-", "_"), plus digits: [a-z0-9_-]. 
+ // https://www.rfc-editor.org/rfc/rfc9309.html#name-formal-syntax + const regex = /^([a-z0-9_-]+)\s*:\s*(.*)$/; + + const records = lines.map(line => { let rec_match = line.match(regex); if (rec_match) { return { - record_type: rec_match[0].trim(), - record_value: line.slice(line.indexOf(':') + 1).trim() + record_type: rec_match[1].trim(), + record_value: rec_match[2].trim() }; } @@ -82,9 +85,7 @@ const parseRecords = (text)=>{ record_type: 'other', record_value: line }; - - } - ); + }); return records; } @@ -108,8 +109,29 @@ return fetchWithTimeout('/robots.txt') // Record counts by type of record result.record_counts.by_type = {}; - for (let rec_type of Object.keys(RECORD_COUNT_TYPES)) { - result.record_counts.by_type[RECORD_COUNT_TYPES[rec_type]] = records.filter((e)=>e['record_type'] == rec_type).length; + + // Initialize default types to 0 so they're always present in the output. + for (let key in RECORD_COUNT_TYPES) { + result.record_counts.by_type[RECORD_COUNT_TYPES[key]] = 0; + } + + // Count all types found + for (let record of records) { + let rawType = record.record_type; + let outputKey; + + if (RECORD_COUNT_TYPES[rawType]) { + outputKey = RECORD_COUNT_TYPES[rawType]; + } else { + // Normalize unknown types for output. + outputKey = rawType.replace(/-/g, '_'); + } + + // Initialize if not already present so we can increment it. + if (result.record_counts.by_type[outputKey] === undefined) { + result.record_counts.by_type[outputKey] = 0; + } + result.record_counts.by_type[outputKey]++; } // Record counts by user-agent @@ -136,11 +158,26 @@ return fetchWithTimeout('/robots.txt') applies_to_useragent = [record.record_value]; } - } else if (record.record_type in BY_USERAGENT_TYPES) { - for (let ua of applies_to_useragent) { - counts_by_useragent[ua][BY_USERAGENT_TYPES[record.record_type]] += 1; + } else { + // Ignore sitemap records because they're not associated with a + // user-agent. 
+ if (record.record_type !== 'sitemap') { + let outputKey; + if (BY_USERAGENT_TYPES[record.record_type]) { + outputKey = BY_USERAGENT_TYPES[record.record_type]; + } else { + outputKey = record.record_type.replace(/-/g, '_'); + } + + for (let ua of applies_to_useragent) { + // Initialize if not already present so we can increment + // it. + if (counts_by_useragent[ua][outputKey] === undefined) { + counts_by_useragent[ua][outputKey] = 0; + } + counts_by_useragent[ua][outputKey]++; + } } - } last = record.record_type; From a0f34fea60009f4155286c32773f9b1e7d64e21f Mon Sep 17 00:00:00 2001 From: Gary Illyes <51719901+garyillyes@users.noreply.github.com> Date: Thu, 29 Jan 2026 16:38:46 +0100 Subject: [PATCH 2/4] Update dist/robots_txt.js Co-authored-by: Barry Pollard --- dist/robots_txt.js | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/dist/robots_txt.js b/dist/robots_txt.js index d674a68..30a5f0c 100644 --- a/dist/robots_txt.js +++ b/dist/robots_txt.js @@ -127,11 +127,7 @@ return fetchWithTimeout('/robots.txt') outputKey = rawType.replace(/-/g, '_'); } - // Initialize if not already present so we can increment it. - if (result.record_counts.by_type[outputKey] === undefined) { - result.record_counts.by_type[outputKey] = 0; - } - result.record_counts.by_type[outputKey]++; + result.record_counts.by_type[outputKey] = (result.record_counts.by_type[outputKey] ?? 
0) + 1; } // Record counts by user-agent From e8354555936d4fc61ac090cc35df9c89c55da419 Mon Sep 17 00:00:00 2001 From: Gary Illyes <51719901+garyillyes@users.noreply.github.com> Date: Thu, 29 Jan 2026 16:38:56 +0100 Subject: [PATCH 3/4] Update dist/robots_txt.js Co-authored-by: Barry Pollard --- dist/robots_txt.js | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/dist/robots_txt.js b/dist/robots_txt.js index 30a5f0c..6eda733 100644 --- a/dist/robots_txt.js +++ b/dist/robots_txt.js @@ -166,12 +166,7 @@ return fetchWithTimeout('/robots.txt') } for (let ua of applies_to_useragent) { - // Initialize if not already present so we can increment - // it. - if (counts_by_useragent[ua][outputKey] === undefined) { - counts_by_useragent[ua][outputKey] = 0; - } - counts_by_useragent[ua][outputKey]++; + counts_by_useragent[ua][outputKey] = (counts_by_useragent[ua][outputKey] ?? 0) + 1 } } } From 2840ce04fa99e5648b948dcc48b9614ab1fd9096 Mon Sep 17 00:00:00 2001 From: Gary Illyes <51719901+garyillyes@users.noreply.github.com> Date: Thu, 29 Jan 2026 18:38:55 +0100 Subject: [PATCH 4/4] Update robots_txt.js Refactor record counting to be fully dynamic rather than relying on static dictionaries with additional dynamic counting. 
--- dist/robots_txt.js | 51 ++++++++-------------------------------------- 1 file changed, 8 insertions(+), 43 deletions(-) diff --git a/dist/robots_txt.js b/dist/robots_txt.js index 6eda733..df83725 100644 --- a/dist/robots_txt.js +++ b/dist/robots_txt.js @@ -40,22 +40,8 @@ const fetchWithTimeout = (url) => { return fetch(url, {signal: controller.signal}); } -const RECORD_COUNT_TYPES = { - 'sitemap': 'sitemap', - 'user-agent': 'user_agent', - 'allow': 'allow', - 'disallow': 'disallow', - 'crawl-delay': 'crawl_delay', - 'noindex': 'noindex', - 'other': 'other' -}; - -const BY_USERAGENT_TYPES = { - 'allow': 'allow', - 'disallow': 'disallow', - 'crawl-delay': 'crawl_delay', - 'noindex': 'noindex', - 'other': 'other' +const NON_USERAGENT_TYPES = { + 'sitemap': 'sitemap' }; const parseRecords = (text)=>{ @@ -110,23 +96,10 @@ return fetchWithTimeout('/robots.txt') // Record counts by type of record result.record_counts.by_type = {}; - // Initialize default types to 0 so they're always present in the output. - for (let key in RECORD_COUNT_TYPES) { - result.record_counts.by_type[RECORD_COUNT_TYPES[key]] = 0; - } - // Count all types found for (let record of records) { let rawType = record.record_type; - let outputKey; - - if (RECORD_COUNT_TYPES[rawType]) { - outputKey = RECORD_COUNT_TYPES[rawType]; - } else { - // Normalize unknown types for output. - outputKey = rawType.replace(/-/g, '_'); - } - + let outputKey = rawType.replace(/-/g, '_'); result.record_counts.by_type[outputKey] = (result.record_counts.by_type[outputKey] ?? 0) + 1; } @@ -141,10 +114,7 @@ return fetchWithTimeout('/robots.txt') // If empty build if (!(record.record_value in counts_by_useragent)) { - counts_by_useragent[record.record_value] = Object.values(BY_USERAGENT_TYPES).reduce((a,v)=>({ - ...a, - [v]: 0 - }), {}); + counts_by_useragent[record.record_value] = {}; } // If prior record UA, append to list, else create list of 1. 
@@ -155,15 +125,10 @@ return fetchWithTimeout('/robots.txt') } } else { - // Ignore sitemap records because they're not associated with a - // user-agent. - if (record.record_type !== 'sitemap') { - let outputKey; - if (BY_USERAGENT_TYPES[record.record_type]) { - outputKey = BY_USERAGENT_TYPES[record.record_type]; - } else { - outputKey = record.record_type.replace(/-/g, '_'); - } + // Ignore global records such as 'sitemap' because they're not + // associated with a user-agent. + if (!(record.record_type in NON_USERAGENT_TYPES)) { + let outputKey = record.record_type.replace(/-/g, '_'); for (let ua of applies_to_useragent) { counts_by_useragent[ua][outputKey] = (counts_by_useragent[ua][outputKey] ?? 0) + 1