1 change: 1 addition & 0 deletions package.json
@@ -34,6 +34,7 @@
"pixelmatch": "^5.3.0",
"pngjs": "^7.0.0",
"puppeteer-core": "^24.22.0",
"robots-parser": "^3.0.1",
"sax": "^1.3.0",
"sharp": "^0.32.6",
"tsc": "^2.0.4",
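For context, a minimal sketch of the robots-parser API that this new dependency provides, as used by the crawler code below (the URL and robots.txt body here are placeholders, not taken from this PR):

import robotsParser from "robots-parser";

// Hypothetical robots.txt body, for illustration only.
const robots = robotsParser(
  "https://example.com/robots.txt",
  "User-agent: *\nDisallow: /private/",
);

// Both checks take a URL and an optional user-agent string.
robots.isDisallowed("https://example.com/private/page", "Browsertrix/1.0"); // true
robots.isAllowed("https://example.com/", "Browsertrix/1.0"); // true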
105 changes: 105 additions & 0 deletions src/crawler.ts
@@ -3,6 +3,8 @@ import path from "path";
import fs, { WriteStream } from "fs";
import os from "os";
import fsp from "fs/promises";
import { fetch as undiciFetch } from "undici";
import robotsParser, { Robot } from "robots-parser";

import {
RedisCrawlState,
@@ -36,6 +38,7 @@ import { logger, formatErr, LogDetails, LogContext } from "./util/logger.js";
import { WorkerState, closeWorkers, runWorkers } from "./util/worker.js";
import { sleep, timedRun, secondsElapsed } from "./util/timing.js";
import { collectCustomBehaviors, getInfoString } from "./util/file_reader.js";
import { getProxyDispatcher } from "./util/proxy.js";

import { Browser } from "./util/browser.js";

@@ -1249,6 +1252,96 @@ self.__bx_behaviors.selectMainBehavior();
}
}

async _fetchRobots(url: string) {
while (true) {
const resp = await undiciFetch(url, {
headers: this.headers,
dispatcher: getProxyDispatcher(url),
});

if (resp.ok) {
return resp;
}

const retry = resp.headers.get("retry-after");
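// Note: Retry-After may be delay-seconds or an HTTP-date; only the numeric seconds form is handled here.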

if (retry) {
logger.debug(
"Robots.txt fetch: Retry after",
{ url, retrySeconds: retry },
"robots",
);
await sleep(parseInt(retry));
continue;
}

logger.debug(
"Robots.txt not fetched",
{ url, status: resp.status },
"robots",
);
return null;
}
}

async fetchAndParseRobots(
url: string,
logDetails: LogDetails,
): Promise<Robot | null> {
// Fetch robots.txt for the URL's origin and return a parser.
// Results are cached by robots.txt URL in Redis using an LRU cache
// that retains the 100 most recently used entries.
const urlParser = new URL(url);
const robotsUrl = `${urlParser.origin}/robots.txt`;

const cachedRobots = await this.crawlState.getCachedRobots(robotsUrl);
if (cachedRobots) {
logger.debug(
"Using cached robots.txt body",
{
url: robotsUrl,
...logDetails,
},
"robots",
);
return robotsParser(robotsUrl, cachedRobots);
}

try {
logger.debug(
"Fetching robots.txt",
{ url: robotsUrl, ...logDetails },
"robots",
);
const resp = await this._fetchRobots(robotsUrl);
if (!resp) {
return null;
}
const content = await resp.text();

logger.debug(
"Caching robots.txt body",
{ url: robotsUrl, ...logDetails },
"robots",
);
await this.crawlState.setCachedRobots(robotsUrl, content);

return robotsParser(robotsUrl, content);
} catch (e) {
// ignore
}
logger.warn(
"Failed to fetch robots.txt",
{
url: robotsUrl,
...logDetails,
},
"robots",
);
return null;
}

async awaitPageExtraDelay(opts: WorkerState) {
if (this.params.pageExtraDelay) {
const {
@@ -2462,6 +2555,18 @@ self.__bx_behaviors.selectMainBehavior();
return false;
}

if (this.params.robots) {
const robots = await this.fetchAndParseRobots(url, logDetails);
if (robots && robots.isDisallowed(url, "Browsertrix/1.0")) {
logger.debug(
"Page URL not queued, disallowed by robots.txt",
{ url, ...logDetails },
"links",
);
return false;
}
}

const result = await this.crawlState.addToQueue(
{ url, seedId, depth, extraHops, ts, pageid },
this.pageLimit,
7 changes: 7 additions & 0 deletions src/util/argParser.ts
@@ -683,6 +683,13 @@ class ArgParser {
"path to SSH known hosts file for SOCKS5 over SSH proxy connection",
type: "string",
},

robots: {
describe:
"If set, fetch per-host robots.txt and skip pages it disallows",
type: "boolean",
default: false,
},
});
}

2 changes: 2 additions & 0 deletions src/util/constants.ts
@@ -41,6 +41,8 @@ export const FETCH_HEADERS_TIMEOUT_SECS = 30;
export const PAGE_OP_TIMEOUT_SECS = 5;
export const SITEMAP_INITIAL_FETCH_TIMEOUT_SECS = 30;

export const ROBOTS_CACHE_LIMIT = 100;

export type ExtractSelector = {
selector: string;
extract: string;
1 change: 1 addition & 0 deletions src/util/logger.ts
@@ -57,6 +57,7 @@ export const LOG_CONTEXT_TYPES = [
"replay",
"proxy",
"scope",
"robots",
] as const;

export type LogContext = (typeof LOG_CONTEXT_TYPES)[number];
45 changes: 44 additions & 1 deletion src/util/state.ts
@@ -3,7 +3,11 @@ import { v4 as uuidv4 } from "uuid";

import { logger } from "./logger.js";

import { MAX_DEPTH, DEFAULT_MAX_RETRIES } from "./constants.js";
import {
MAX_DEPTH,
DEFAULT_MAX_RETRIES,
ROBOTS_CACHE_LIMIT,
} from "./constants.js";
import { ScopedSeed } from "./seeds.js";
import { Frame } from "puppeteer-core";
import { interpolateFilename } from "./storage.js";
@@ -200,7 +204,10 @@ export class RedisCrawlState {
fkey: string;
ekey: string;
bkey: string;
rkey: string;
lkey: string;
pageskey: string;

esKey: string;
esMap: string;

@@ -233,6 +240,10 @@ export class RedisCrawlState {
this.ekey = this.key + ":e";
// crawler behavior script messages
this.bkey = this.key + ":b";
// cached robots.txt bodies (per-origin)
this.rkey = this.key + ":r";
// LRU cache of robots.txt keys
this.lkey = this.key + ":l";
// pages
this.pageskey = this.key + ":pages";

@@ -1025,6 +1036,38 @@
return inx;
return await this.redis.lpush(this.bkey, behaviorLog);
}

async _updateRobotsAccessTime(robotsUrl: string) {
const accessTime = Date.now();
await this.redis.zadd(this.lkey, accessTime, robotsUrl);
}

async setCachedRobots(robotsUrl: string, body: string) {
await this._updateRobotsAccessTime(robotsUrl);
await this.redis.set(`${this.rkey}:${robotsUrl}`, body);

// prune least-recently used items in zset and robots cache if over limit
const cacheCount = await this.redis.zcard(this.lkey);
if (cacheCount > ROBOTS_CACHE_LIMIT) {
const diff = cacheCount - ROBOTS_CACHE_LIMIT;
const keysToDelete = await this.redis.zrange(this.lkey, 0, diff - 1);

for (const keyToDelete of keysToDelete) {
logger.debug(
"Deleting cached robots.txt, over cache limit",
{ url: keyToDelete },
"robots",
);
await this.redis.del(`${this.rkey}:${keyToDelete}`);
await this.redis.zrem(this.lkey, keyToDelete);
}
}
}

async getCachedRobots(robotsUrl: string) {
await this._updateRobotsAccessTime(robotsUrl);
return await this.redis.get(`${this.rkey}:${robotsUrl}`);
}

async writeToPagesQueue(
data: Record<string, string | number | boolean | object>,
) {
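A brief usage sketch of the robots.txt cache helpers added above, assuming `state` is an existing RedisCrawlState instance (the URL and body are illustrative):

const robotsUrl = "https://example.com/robots.txt";

// Nothing cached yet for this URL, so Redis returns null.
let body = await state.getCachedRobots(robotsUrl);

// Store the fetched body; once more than ROBOTS_CACHE_LIMIT (100) robots.txt
// URLs are cached, the least recently used entries are evicted.
await state.setCachedRobots(robotsUrl, "User-agent: *\nDisallow: /private/");

// Subsequent lookups return the cached body and refresh its access time.
body = await state.getCachedRobots(robotsUrl);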
35 changes: 35 additions & 0 deletions tests/robots_txt.test.js
@@ -0,0 +1,35 @@
import child_process from "child_process";

test("test robots.txt is fetched and cached", async () => {
const res = child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://webrecorder.net/ --scopeType page --robots --logging debug",
);

const log = res.toString();

// robots.txt not found
expect(
log.indexOf(
'"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://specs.webrecorder.net/robots.txt"}}',
) > 0,
).toBe(true);

expect(
log.indexOf(
'"logLevel":"debug","context":"robots","message":"Robots.txt not fetched","details":{"url":"https://specs.webrecorder.net/robots.txt","status":404}}',
) > 0,
).toBe(true);

// robots.txt found and cached
expect(
log.indexOf(
'"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://webrecorder.net/robots.txt"}}',
) > 0,
).toBe(true);

expect(
log.indexOf(
'"logLevel":"debug","context":"robots","message":"Caching robots.txt body","details":{"url":"https://webrecorder.net/robots.txt"}}',
) > 0,
).toBe(true);
});