diff --git a/docs/examples/code_examples/using_sitemap_request_loader.py b/docs/examples/code_examples/using_sitemap_request_loader.py
new file mode 100644
index 0000000000..18079c51f2
--- /dev/null
+++ b/docs/examples/code_examples/using_sitemap_request_loader.py
@@ -0,0 +1,101 @@
+import asyncio
+from collections.abc import Callable
+
+from yarl import URL
+
+from crawlee import RequestOptions, RequestTransformAction
+from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
+from crawlee.http_clients import ImpitHttpClient
+from crawlee.request_loaders import SitemapRequestLoader
+
+
+# Create a transform_request_function that maps request options based on the host in
+# the URL
+def create_transform_request(
+    data_mapper: dict[str, dict],
+) -> Callable[[RequestOptions], RequestOptions | RequestTransformAction]:
+    def transform_request(
+        request_options: RequestOptions,
+    ) -> RequestOptions | RequestTransformAction:
+        # According to the Sitemap protocol, all URLs in a Sitemap must be from a single
+        # host.
+        request_host = URL(request_options['url']).host
+
+        if request_host and (mapping_data := data_mapper.get(request_host)):
+            # Set properties from the mapping data
+            if 'label' in mapping_data:
+                request_options['label'] = mapping_data['label']
+            if 'user_data' in mapping_data:
+                request_options['user_data'] = mapping_data['user_data']
+
+            return request_options
+
+        return 'unchanged'
+
+    return transform_request
+
+
+async def main() -> None:
+    # Prepare data mapping for hosts
+    apify_host = URL('https://apify.com/sitemap.xml').host
+    crawlee_host = URL('https://crawlee.dev/sitemap.xml').host
+
+    if not apify_host or not crawlee_host:
+        raise ValueError('Unable to extract host from URLs')
+
+    data_map = {
+        apify_host: {
+            'label': 'apify',
+            'user_data': {'source': 'apify'},
+        },
+        crawlee_host: {
+            'label': 'crawlee',
+            'user_data': {'source': 'crawlee'},
+        },
+    }
+
+    # Initialize the SitemapRequestLoader with the transform function
+    async with SitemapRequestLoader(
+        # Set the sitemap URLs and the HTTP client
+        sitemap_urls=['https://crawlee.dev/sitemap.xml', 'https://apify.com/sitemap.xml'],
+        http_client=ImpitHttpClient(),
+        transform_request_function=create_transform_request(data_map),
+    ) as sitemap_loader:
+        # Convert the sitemap loader to a request manager
+        request_manager = await sitemap_loader.to_tandem()
+
+        # Create and configure the crawler
+        crawler = BeautifulSoupCrawler(
+            request_manager=request_manager,
+            max_requests_per_crawl=10,
+        )
+
+        # Create default handler for requests without a specific label
+        @crawler.router.default_handler
+        async def handler(context: BeautifulSoupCrawlingContext) -> None:
+            source = context.request.user_data.get('source', 'unknown')
+            context.log.info(
+                f'Processing request: {context.request.url} from source: {source}'
+            )
+
+        # Create handler for requests labeled 'apify'
+        @crawler.router.handler('apify')
+        async def apify_handler(context: BeautifulSoupCrawlingContext) -> None:
+            source = context.request.user_data.get('source', 'unknown')
+            context.log.info(
+                f'Apify handler processing: {context.request.url} from source: {source}'
+            )
+
+        # Create handler for requests labeled 'crawlee'
+        @crawler.router.handler('crawlee')
+        async def crawlee_handler(context: BeautifulSoupCrawlingContext) -> None:
+            source = context.request.user_data.get('source', 'unknown')
+            context.log.info(
+                f'Crawlee handler processing: {context.request.url} from source: {source}'
+            )
+
+        await crawler.run()
+
+
+if __name__ == '__main__':
+ asyncio.run(main())
diff --git a/docs/examples/using_sitemap_request_loader.mdx b/docs/examples/using_sitemap_request_loader.mdx
new file mode 100644
index 0000000000..3ed528e94e
--- /dev/null
+++ b/docs/examples/using_sitemap_request_loader.mdx
@@ -0,0 +1,70 @@
+---
+id: using-sitemap-request-loader
+title: Using sitemap request loader
+---
+
+import ApiLink from '@site/src/components/ApiLink';
+
+import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
+
+import SitemapRequestLoaderExample from '!!raw-loader!roa-loader!./code_examples/using_sitemap_request_loader.py';
+
+This example demonstrates how to use <ApiLink to="class/SitemapRequestLoader">`SitemapRequestLoader`</ApiLink> to crawl websites that publish `sitemap.xml` files following the [Sitemaps protocol](https://www.sitemaps.org/protocol.html). The loader processes sitemaps in a streaming fashion, without loading them into memory in their entirety, which makes it suitable for very large sitemaps.
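+
+In its simplest form, the loader streams URLs from the sitemap and hands them out one at a time through the standard request loader interface. The following minimal sketch, which reuses the `ImpitHttpClient` from the full example below and assumes `https://crawlee.dev/sitemap.xml` as a stand-in sitemap URL, shows that flow in isolation:
+
+```python
+import asyncio
+
+from crawlee.http_clients import ImpitHttpClient
+from crawlee.request_loaders import SitemapRequestLoader
+
+
+async def main() -> None:
+    # Stream URLs from the sitemap; only a bounded buffer is kept in memory.
+    async with SitemapRequestLoader(
+        sitemap_urls=['https://crawlee.dev/sitemap.xml'],
+        http_client=ImpitHttpClient(),
+    ) as sitemap_loader:
+        while not await sitemap_loader.is_finished():
+            request = await sitemap_loader.fetch_next_request()
+            if request is None:
+                continue
+            print(request.url)
+            # Mark the request as handled so the loader can release it.
+            await sitemap_loader.mark_request_as_handled(request)
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
+```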
+
+The example shows how to use the `transform_request_function` parameter to adjust request options based on the request URL. It lets you modify request properties such as the label and user data before each request is created, enabling different handling logic for different websites or site sections.
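+
+Concretely, the transform function receives a `RequestOptions` dictionary with the `url` already filled in, and returns either the modified options or a `RequestTransformAction` string: `'skip'` drops the URL entirely, while `'unchanged'` keeps the request as the loader generated it. A minimal sketch follows; the `/docs/` and `/legal/` path checks are hypothetical, chosen purely for illustration:
+
+```python
+from crawlee import RequestOptions, RequestTransformAction
+
+
+def transform_request(
+    request_options: RequestOptions,
+) -> RequestOptions | RequestTransformAction:
+    if '/legal/' in request_options['url']:
+        # Hypothetical path: drop these URLs before they reach the queue.
+        return 'skip'
+    if '/docs/' in request_options['url']:
+        # Hypothetical path: route documentation pages to a dedicated handler.
+        request_options['label'] = 'docs'
+        return request_options
+    # Leave every other request exactly as the loader generated it.
+    return 'unchanged'
+```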
+
+The code below puts this together: it processes sitemaps from two different domains (Apify and Crawlee) and assigns a label to each request according to its host. The `create_transform_request` function maps each host to the corresponding request configuration, while the crawler routes requests to different handlers based on the assigned labels.
+
+<RunnableCodeBlock className="language-python" language="python">
+    {SitemapRequestLoaderExample}
+</RunnableCodeBlock>
+
+For more information about request loaders, see the [Request loaders guide](../guides/request-loaders).
diff --git a/src/crawlee/request_loaders/_sitemap_request_loader.py b/src/crawlee/request_loaders/_sitemap_request_loader.py
index afec4d4361..ee278e4eda 100644
--- a/src/crawlee/request_loaders/_sitemap_request_loader.py
+++ b/src/crawlee/request_loaders/_sitemap_request_loader.py
@@ -9,7 +9,7 @@
 from pydantic import BaseModel, ConfigDict, Field
 from typing_extensions import override

-from crawlee import Request
+from crawlee import Request, RequestOptions
 from crawlee._utils.docs import docs_group
 from crawlee._utils.globs import Glob
 from crawlee._utils.recoverable_state import RecoverableState
@@ -18,9 +18,10 @@
 if TYPE_CHECKING:
     import re
-    from collections.abc import Sequence
+    from collections.abc import Callable, Sequence
     from types import TracebackType

+    from crawlee import RequestTransformAction
     from crawlee.http_clients import HttpClient
     from crawlee.proxy_configuration import ProxyInfo
     from crawlee.storage_clients.models import ProcessedRequest
@@ -112,6 +113,7 @@ def __init__(
         exclude: list[re.Pattern[Any] | Glob] | None = None,
         max_buffer_size: int = 200,
         persist_state_key: str | None = None,
+        transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
     ) -> None:
         """Initialize the sitemap request loader.
@@ -125,6 +127,9 @@ def __init__(
             persist_state_key: A key for persisting the loader's state in the KeyValueStore.
                 When provided, allows resuming from where it left off after interruption.
                 If None, no state persistence occurs.
+            transform_request_function: An optional function to transform requests generated by
+                the loader. It receives `RequestOptions` with the `url` set and must return either the
+                modified `RequestOptions` or a `RequestTransformAction` (`'skip'` or `'unchanged'`).
         """
         self._http_client = http_client
         self._sitemap_urls = sitemap_urls
@@ -132,6 +137,7 @@ def __init__(
         self._exclude = exclude
         self._proxy_info = proxy_info
         self._max_buffer_size = max_buffer_size
+        self._transform_request_function = transform_request_function

         # Synchronization for queue operations
         self._queue_has_capacity = asyncio.Event()
@@ -313,8 +319,15 @@ async def fetch_next_request(self) -> Request | None:
             async with self._queue_lock:
                 url = state.url_queue.popleft()
-
-                request = Request.from_url(url)
+                request_options = RequestOptions(url=url)
+                if self._transform_request_function:
+                    transform_request_options = self._transform_request_function(request_options)
+                    if transform_request_options == 'skip':
+                        state.total_count -= 1
+                        continue
+                    if transform_request_options != 'unchanged':
+                        request_options = transform_request_options
+                request = Request.from_url(**request_options)
                 state.in_progress.add(request.url)
                 if len(state.url_queue) < self._max_buffer_size:
                     self._queue_has_capacity.set()
diff --git a/tests/unit/request_loaders/test_sitemap_request_loader.py b/tests/unit/request_loaders/test_sitemap_request_loader.py
index b70f63144c..1b8d652059 100644
--- a/tests/unit/request_loaders/test_sitemap_request_loader.py
+++ b/tests/unit/request_loaders/test_sitemap_request_loader.py
@@ -4,6 +4,7 @@
 from yarl import URL

+from crawlee import RequestOptions, RequestTransformAction
 from crawlee.http_clients._base import HttpClient
 from crawlee.request_loaders._sitemap_request_loader import SitemapRequestLoader
 from crawlee.storages import KeyValueStore
@@ -172,3 +173,37 @@ async def test_recovery_data_persistence_for_sitemap_loading(
     assert item is not None
     assert item.url == next_item_in_kvs
+
+
+async def test_transform_request_function(server_url: URL, http_client: HttpClient) -> None:
+    sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode()))
+
+    def transform_request(request_options: RequestOptions) -> RequestOptions | RequestTransformAction:
+        request_options['user_data'] = {'transformed': True}
+        return request_options
+
+    sitemap_loader = SitemapRequestLoader(
+        [str(sitemap_url)],
+        http_client=http_client,
+        transform_request_function=transform_request,
+    )
+
+    extracted_urls = set()
+
+    while not await sitemap_loader.is_finished():
+        request = await sitemap_loader.fetch_next_request()
+        assert request is not None
+        assert request.user_data.get('transformed') is True
+
+        extracted_urls.add(request.url)
+
+        await sitemap_loader.mark_request_as_handled(request)
+
+    assert len(extracted_urls) == 5
+    assert extracted_urls == {
+        'http://not-exists.com/',
+        'http://not-exists.com/catalog?item=12&desc=vacation_hawaii',
+        'http://not-exists.com/catalog?item=73&desc=vacation_new_zealand',
+        'http://not-exists.com/catalog?item=74&desc=vacation_newfoundland',
+        'http://not-exists.com/catalog?item=83&desc=vacation_usa',
+    }