101 changes: 101 additions & 0 deletions docs/examples/code_examples/using_sitemap_request_loader.py
@@ -0,0 +1,101 @@
import asyncio
from collections.abc import Callable

from yarl import URL

from crawlee import RequestOptions, RequestTransformAction
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.http_clients import ImpitHttpClient
from crawlee.request_loaders import SitemapRequestLoader


# Create a transform_request_function that maps request options based on the host in
# the URL
def create_transform_request(
data_mapper: dict[str, dict],
) -> Callable[[RequestOptions], RequestOptions | RequestTransformAction]:
def transform_request(
request_options: RequestOptions,
) -> RequestOptions | RequestTransformAction:
# According to the Sitemap protocol, all URLs in a Sitemap must be from a single
# host.
request_host = URL(request_options['url']).host
> **Collaborator:** Maybe we should mention that a sitemap should only contain links to the same host here.
>
> **Author:** Good point. Added.

if request_host and (mapping_data := data_mapper.get(request_host)):
# Set properties from the mapping data
if 'label' in mapping_data:
request_options['label'] = mapping_data['label']
if 'user_data' in mapping_data:
request_options['user_data'] = mapping_data['user_data']

return request_options

return 'unchanged'

return transform_request


async def main() -> None:
# Prepare data mapping for hosts
apify_host = URL('https://apify.com/sitemap.xml').host
crawlee_host = URL('https://crawlee.dev/sitemap.xml').host

if not apify_host or not crawlee_host:
raise ValueError('Unable to extract host from URLs')

data_map = {
apify_host: {
'label': 'apify',
'user_data': {'source': 'apify'},
},
crawlee_host: {
'label': 'crawlee',
'user_data': {'source': 'crawlee'},
},
}

# Initialize the SitemapRequestLoader with the transform function
async with SitemapRequestLoader(
# Set the sitemap URLs and the HTTP client
sitemap_urls=['https://crawlee.dev/sitemap.xml', 'https://apify.com/sitemap.xml'],
http_client=ImpitHttpClient(),
transform_request_function=create_transform_request(data_map),
) as sitemap_loader:
# Convert the sitemap loader to a request manager
request_manager = await sitemap_loader.to_tandem()

# Create and configure the crawler
crawler = BeautifulSoupCrawler(
request_manager=request_manager,
max_requests_per_crawl=10,
)

# Create default handler for requests without a specific label
@crawler.router.default_handler
async def handler(context: BeautifulSoupCrawlingContext) -> None:
source = context.request.user_data.get('source', 'unknown')
context.log.info(
f'Processing request: {context.request.url} from source: {source}'
)

# Create handler for requests labeled 'apify'
@crawler.router.handler('apify')
async def apify_handler(context: BeautifulSoupCrawlingContext) -> None:
source = context.request.user_data.get('source', 'unknown')
context.log.info(
f'Apify handler processing: {context.request.url} from source: {source}'
)

# Create handler for requests labeled 'crawlee'
@crawler.router.handler('crawlee')
async def crawlee_handler(context: BeautifulSoupCrawlingContext) -> None:
source = context.request.user_data.get('source', 'unknown')
context.log.info(
f'Crawlee handler processing: {context.request.url} from source: {source}'
)

await crawler.run()


if __name__ == '__main__':
asyncio.run(main())
22 changes: 22 additions & 0 deletions docs/examples/using_sitemap_request_loader.mdx
@@ -0,0 +1,22 @@
---
id: using-sitemap-request-loader
title: Using sitemap request loader
---

import ApiLink from '@site/src/components/ApiLink';

import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import SitemapRequestLoaderExample from '!!raw-loader!roa-loader!./code_examples/using_sitemap_request_loader.py';

This example demonstrates how to use <ApiLink to="class/SitemapRequestLoader">`SitemapRequestLoader`</ApiLink> to crawl websites that provide `sitemap.xml` files following the [Sitemaps protocol](https://www.sitemaps.org/protocol.html). The <ApiLink to="class/SitemapRequestLoader">`SitemapRequestLoader`</ApiLink> processes sitemaps in a streaming fashion without loading them entirely into memory, making it suitable for large sitemaps.

The example shows how to use the `transform_request_function` parameter to configure request options based on the request URL. This lets you modify request properties such as the label and user data per URL, enabling different handling logic for different websites or site sections.
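
The contract is small: the function receives a `RequestOptions` dictionary containing the `url` of each sitemap entry, and returns either modified options or an action string such as `'skip'` (drop the URL) or `'unchanged'` (keep it as is). A minimal sketch, where the URL check and the `docs` label are purely illustrative:

```python
from crawlee import RequestOptions, RequestTransformAction


def transform_request(
    request_options: RequestOptions,
) -> RequestOptions | RequestTransformAction:
    # Returning modified options changes the request before it is enqueued.
    if '/docs/' in request_options['url']:  # illustrative condition
        request_options['label'] = 'docs'  # hypothetical label
        return request_options
    # 'skip' would drop the URL entirely; 'unchanged' keeps the defaults.
    return 'unchanged'
```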

The code example below processes sitemaps from two different hosts (apify.com and crawlee.dev), assigning requests different labels based on their host. The `create_transform_request` function maps each host to its request configuration, and the crawler dispatches requests to different handlers according to the assigned labels.

<RunnableCodeBlock className="language-python" language="python">
{SitemapRequestLoaderExample}
</RunnableCodeBlock>

For more information about request loaders, see the [Request loaders guide](../guides/request-loaders).
21 changes: 17 additions & 4 deletions src/crawlee/request_loaders/_sitemap_request_loader.py
@@ -9,7 +9,7 @@
from pydantic import BaseModel, ConfigDict, Field
from typing_extensions import override

from crawlee import Request
from crawlee import Request, RequestOptions
from crawlee._utils.docs import docs_group
from crawlee._utils.globs import Glob
from crawlee._utils.recoverable_state import RecoverableState
@@ -18,9 +18,10 @@

if TYPE_CHECKING:
import re
from collections.abc import Sequence
from collections.abc import Callable, Sequence
from types import TracebackType

from crawlee import RequestTransformAction
from crawlee.http_clients import HttpClient
from crawlee.proxy_configuration import ProxyInfo
from crawlee.storage_clients.models import ProcessedRequest
@@ -112,6 +113,7 @@ def __init__(
exclude: list[re.Pattern[Any] | Glob] | None = None,
max_buffer_size: int = 200,
persist_state_key: str | None = None,
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
> **Collaborator:** But shouldn't this also receive the URL of the origin sitemap?
>
> **Author:** I don't think that makes sense. A sitemap cannot contain links to another domain. This way, users can easily create a mapping between the original link to the sitemap and the link inside `transform_request_function`. From my point of view, the most valuable thing that adding `transform_request_function` gives is the ability to add a label so that the request is processed by the appropriate handler.
>
> **Collaborator:** That makes a lot of sense, thanks. But I'm afraid that this won't "click" for a lot of people. Perhaps we could add an example that showcases this?
) -> None:
"""Initialize the sitemap request loader.

@@ -125,13 +127,17 @@
persist_state_key: A key for persisting the loader's state in the KeyValueStore.
When provided, allows resuming from where it left off after interruption.
If None, no state persistence occurs.
transform_request_function: An optional function to transform requests
generated by the loader. It receives `RequestOptions` with `url` and should return either
modified `RequestOptions` or a `RequestTransformAction`.
"""
self._http_client = http_client
self._sitemap_urls = sitemap_urls
self._include = include
self._exclude = exclude
self._proxy_info = proxy_info
self._max_buffer_size = max_buffer_size
self._transform_request_function = transform_request_function

# Synchronization for queue operations
self._queue_has_capacity = asyncio.Event()
@@ -313,8 +319,15 @@ async def fetch_next_request(self) -> Request | None:

async with self._queue_lock:
url = state.url_queue.popleft()

request = Request.from_url(url)
request_option = RequestOptions(url=url)
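            # Let the optional transform hook adjust or veto the request options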
if self._transform_request_function:
transform_request_option = self._transform_request_function(request_option)
if transform_request_option == 'skip':
state.total_count -= 1
continue
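                # 'unchanged' keeps the original options; any returned dict replaces them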
if transform_request_option != 'unchanged':
request_option = transform_request_option
request = Request.from_url(**request_option)
state.in_progress.add(request.url)
if len(state.url_queue) < self._max_buffer_size:
self._queue_has_capacity.set()
35 changes: 35 additions & 0 deletions tests/unit/request_loaders/test_sitemap_request_loader.py
@@ -4,6 +4,7 @@

from yarl import URL

from crawlee import RequestOptions, RequestTransformAction
from crawlee.http_clients._base import HttpClient
from crawlee.request_loaders._sitemap_request_loader import SitemapRequestLoader
from crawlee.storages import KeyValueStore
@@ -172,3 +173,37 @@ async def test_recovery_data_persistence_for_sitemap_loading(

assert item is not None
assert item.url == next_item_in_kvs


async def test_transform_request_function(server_url: URL, http_client: HttpClient) -> None:
sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode()))

def transform_request(request_options: RequestOptions) -> RequestOptions | RequestTransformAction:
request_options['user_data'] = {'transformed': True}
return request_options

sitemap_loader = SitemapRequestLoader(
[str(sitemap_url)],
http_client=http_client,
transform_request_function=transform_request,
)

extracted_urls = set()

while not await sitemap_loader.is_finished():
request = await sitemap_loader.fetch_next_request()
assert request is not None
assert request.user_data.get('transformed') is True

extracted_urls.add(request.url)

await sitemap_loader.mark_request_as_handled(request)

assert len(extracted_urls) == 5
assert extracted_urls == {
'http://not-exists.com/',
'http://not-exists.com/catalog?item=12&desc=vacation_hawaii',
'http://not-exists.com/catalog?item=73&desc=vacation_new_zealand',
'http://not-exists.com/catalog?item=74&desc=vacation_newfoundland',
'http://not-exists.com/catalog?item=83&desc=vacation_usa',
}