1 change: 1 addition & 0 deletions package.json
@@ -34,6 +34,7 @@
"pixelmatch": "^5.3.0",
"pngjs": "^7.0.0",
"puppeteer-core": "^24.22.0",
"robots-parser": "^3.0.1",
"sax": "^1.3.0",
"sharp": "^0.32.6",
"tsc": "^2.0.4",
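For context, a minimal sketch of the robots-parser API that this new dependency provides, as used by the crawler code below (the URL and robots.txt body here are placeholders, not taken from this PR):

import robotsParser from "robots-parser";

// Hypothetical robots.txt body, for illustration only.
const robots = robotsParser(
  "https://example.com/robots.txt",
  "User-agent: *\nDisallow: /private/",
);

// Both checks take a URL and an optional user-agent string.
robots.isDisallowed("https://example.com/private/page", "Browsertrix/1.0"); // true
robots.isAllowed("https://example.com/", "Browsertrix/1.0"); // true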
105 changes: 105 additions & 0 deletions src/crawler.ts
@@ -3,6 +3,8 @@ import path from "path";
import fs, { WriteStream } from "fs";
import os from "os";
import fsp from "fs/promises";
import { fetch as undiciFetch } from "undici";
import robotsParser, { Robot } from "robots-parser";

import {
RedisCrawlState,
@@ -36,6 +38,7 @@ import { logger, formatErr, LogDetails, LogContext } from "./util/logger.js";
import { WorkerState, closeWorkers, runWorkers } from "./util/worker.js";
import { sleep, timedRun, secondsElapsed } from "./util/timing.js";
import { collectCustomBehaviors, getInfoString } from "./util/file_reader.js";
import { getProxyDispatcher } from "./util/proxy.js";

import { Browser } from "./util/browser.js";

@@ -1249,6 +1252,96 @@ self.__bx_behaviors.selectMainBehavior();
}
}

async _fetchRobots(url: string) {
while (true) {
const resp = await undiciFetch(url, {
headers: this.headers,
dispatcher: getProxyDispatcher(url),
});

if (resp.ok) {
return resp;
}

const retry = resp.headers.get("retry-after");
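// Note: Retry-After may be delay-seconds or an HTTP-date; only the numeric seconds form is handled here.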

if (retry) {
logger.debug(
"Robots.txt fetch: Retry after",
{ url, retrySeconds: retry },
"robots",
);
await sleep(parseInt(retry));
continue;
}

logger.debug(
"Robots.txt not fetched",
{ url, status: resp.status },
"robots",
);
return null;
}
}

async fetchAndParseRobots(
url: string,
logDetails: LogDetails,
): Promise<Robot | null> {
// Fetch robots.txt for the URL's origin and return a parser.
// Results are cached by robots.txt URL in Redis using an LRU cache
// that retains the 100 most recently used entries.
const urlParser = new URL(url);
const robotsUrl = `${urlParser.origin}/robots.txt`;

const cachedRobots = await this.crawlState.getCachedRobots(robotsUrl);
if (cachedRobots) {
logger.debug(
"Using cached robots.txt body",
{
url: robotsUrl,
...logDetails,
},
"robots",
);
return robotsParser(robotsUrl, cachedRobots);
}

try {
logger.debug(
"Fetching robots.txt",
{ url: robotsUrl, ...logDetails },
"robots",
);
const resp = await this._fetchRobots(robotsUrl);
if (!resp) {
return null;
}
const content = await resp.text();

logger.debug(
"Caching robots.txt body",
{ url: robotsUrl, ...logDetails },
"robots",
);
await this.crawlState.setCachedRobots(robotsUrl, content);

return robotsParser(robotsUrl, content);
} catch (e) {
// ignore
}
logger.warn(
"Failed to fetch robots.txt",
{
url: robotsUrl,
...logDetails,
},
"robots",
);
return null;
}

async awaitPageExtraDelay(opts: WorkerState) {
if (this.params.pageExtraDelay) {
const {
@@ -2462,6 +2555,18 @@ self.__bx_behaviors.selectMainBehavior();
return false;
}

if (this.params.robots) {
const robots = await this.fetchAndParseRobots(url, logDetails);
if (robots && robots.isDisallowed(url, "Browsertrix/1.0")) {
logger.debug(
"Page URL not queued, disallowed by robots.txt",
{ url, ...logDetails },
"links",
);
return false;
}
}

const result = await this.crawlState.addToQueue(
{ url, seedId, depth, extraHops, ts, pageid },
this.pageLimit,
7 changes: 7 additions & 0 deletions src/util/argParser.ts
@@ -683,6 +683,13 @@ class ArgParser {
"path to SSH known hosts file for SOCKS5 over SSH proxy connection",
type: "string",
},

robots: {
describe:
"If set, fetch per-host robots.txt and skip pages it disallows",
type: "boolean",
default: false,
},
});
}

2 changes: 2 additions & 0 deletions src/util/constants.ts
@@ -41,6 +41,8 @@ export const FETCH_HEADERS_TIMEOUT_SECS = 30;
export const PAGE_OP_TIMEOUT_SECS = 5;
export const SITEMAP_INITIAL_FETCH_TIMEOUT_SECS = 30;

export const ROBOTS_CACHE_LIMIT = 100;

export type ExtractSelector = {
selector: string;
extract: string;
1 change: 1 addition & 0 deletions src/util/logger.ts
@@ -57,6 +57,7 @@ export const LOG_CONTEXT_TYPES = [
"replay",
"proxy",
"scope",
"robots",
] as const;

export type LogContext = (typeof LOG_CONTEXT_TYPES)[number];
45 changes: 44 additions & 1 deletion src/util/state.ts
@@ -3,7 +3,11 @@ import { v4 as uuidv4 } from "uuid";

import { logger } from "./logger.js";

import { MAX_DEPTH, DEFAULT_MAX_RETRIES } from "./constants.js";
import {
MAX_DEPTH,
DEFAULT_MAX_RETRIES,
ROBOTS_CACHE_LIMIT,
} from "./constants.js";
import { ScopedSeed } from "./seeds.js";
import { Frame } from "puppeteer-core";
import { interpolateFilename } from "./storage.js";
@@ -200,7 +204,10 @@ export class RedisCrawlState {
fkey: string;
ekey: string;
bkey: string;
rkey: string;
lkey: string;
pageskey: string;

esKey: string;
esMap: string;

@@ -233,6 +240,10 @@ export class RedisCrawlState {
this.ekey = this.key + ":e";
// crawler behavior script messages
this.bkey = this.key + ":b";
// cached robots.txt bodies (per-origin)
this.rkey = this.key + ":r";
// LRU cache of robots.txt keys
this.lkey = this.key + ":l";
// pages
this.pageskey = this.key + ":pages";

@@ -1025,6 +1036,38 @@
return inx;
return await this.redis.lpush(this.bkey, behaviorLog);
}

async _updateRobotsAccessTime(robotsUrl: string) {
const accessTime = Date.now();
await this.redis.zadd(this.lkey, accessTime, robotsUrl);
}

async setCachedRobots(robotsUrl: string, body: string) {
await this._updateRobotsAccessTime(robotsUrl);
await this.redis.set(`${this.rkey}:${robotsUrl}`, body);

// prune least-recently used items in zset and robots cache if over limit
const cacheCount = await this.redis.zcard(this.lkey);
if (cacheCount > ROBOTS_CACHE_LIMIT) {
const diff = cacheCount - ROBOTS_CACHE_LIMIT;
const keysToDelete = await this.redis.zrange(this.lkey, 0, diff - 1);

for (const keyToDelete of keysToDelete) {
logger.debug(
"Deleting cached robots.txt, over cache limit",
{ url: keyToDelete },
"robots",
);
await this.redis.del(`${this.rkey}:${keyToDelete}`);
await this.redis.zrem(this.lkey, keyToDelete);
}
}
}

async getCachedRobots(robotsUrl: string) {
await this._updateRobotsAccessTime(robotsUrl);
return await this.redis.get(`${this.rkey}:${robotsUrl}`);
}

async writeToPagesQueue(
data: Record<string, string | number | boolean | object>,
) {
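A brief usage sketch of the robots.txt cache helpers added above, assuming `state` is an existing RedisCrawlState instance (the URL and body are illustrative):

const robotsUrl = "https://example.com/robots.txt";

// Nothing cached yet for this URL, so Redis returns null.
let body = await state.getCachedRobots(robotsUrl);

// Store the fetched body; once more than ROBOTS_CACHE_LIMIT (100) robots.txt
// URLs are cached, the least recently used entries are evicted.
await state.setCachedRobots(robotsUrl, "User-agent: *\nDisallow: /private/");

// Subsequent lookups return the cached body and refresh its access time.
body = await state.getCachedRobots(robotsUrl);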
35 changes: 35 additions & 0 deletions tests/robots_txt.test.js
@@ -0,0 +1,35 @@
import child_process from "child_process";

test("test robots.txt is fetched and cached", async () => {
const res = child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://webrecorder.net/ --scopeType page --robots --logging debug",
);

const log = res.toString();

// robots.txt not found
expect(
log.indexOf(
'"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://specs.webrecorder.net/robots.txt"}}',
) > 0,
).toBe(true);

expect(
log.indexOf(
'"logLevel":"debug","context":"robots","message":"Robots.txt not fetched","details":{"url":"https://specs.webrecorder.net/robots.txt","status":404}}',
) > 0,
).toBe(true);

// robots.txt found and cached
expect(
log.indexOf(
'"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://webrecorder.net/robots.txt"}}',
) > 0,
).toBe(true);

expect(
log.indexOf(
'"logLevel":"debug","context":"robots","message":"Caching robots.txt body","details":{"url":"https://webrecorder.net/robots.txt"}}',
) > 0,
).toBe(true);
});