feat: add transform_request_function parameter for SitemapRequestLoader
#1525
Changes from all commits: b05e27b, f27ccd9, 65190f2, 44c3487, 0cdb50e
New file `code_examples/using_sitemap_request_loader.py`:

```python
import asyncio
from collections.abc import Callable

from yarl import URL

from crawlee import RequestOptions, RequestTransformAction
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.http_clients import ImpitHttpClient
from crawlee.request_loaders import SitemapRequestLoader


# Create a transform_request_function that maps request options based on the host in
# the URL
def create_transform_request(
    data_mapper: dict[str, dict],
) -> Callable[[RequestOptions], RequestOptions | RequestTransformAction]:
    def transform_request(
        request_options: RequestOptions,
    ) -> RequestOptions | RequestTransformAction:
        # According to the Sitemap protocol, all URLs in a sitemap must be from a
        # single host.
        request_host = URL(request_options['url']).host

        if request_host and (mapping_data := data_mapper.get(request_host)):
            # Set properties from the mapping data
            if 'label' in mapping_data:
                request_options['label'] = mapping_data['label']
            if 'user_data' in mapping_data:
                request_options['user_data'] = mapping_data['user_data']

            return request_options

        return 'unchanged'

    return transform_request


async def main() -> None:
    # Prepare data mapping for hosts
    apify_host = URL('https://apify.com/sitemap.xml').host
    crawlee_host = URL('https://crawlee.dev/sitemap.xml').host

    if not apify_host or not crawlee_host:
        raise ValueError('Unable to extract host from URLs')

    data_map = {
        apify_host: {
            'label': 'apify',
            'user_data': {'source': 'apify'},
        },
        crawlee_host: {
            'label': 'crawlee',
            'user_data': {'source': 'crawlee'},
        },
    }

    # Initialize the SitemapRequestLoader with the transform function
    async with SitemapRequestLoader(
        # Set the sitemap URLs and the HTTP client
        sitemap_urls=['https://crawlee.dev/sitemap.xml', 'https://apify.com/sitemap.xml'],
        http_client=ImpitHttpClient(),
        transform_request_function=create_transform_request(data_map),
    ) as sitemap_loader:
        # Convert the sitemap loader to a request manager
        request_manager = await sitemap_loader.to_tandem()

        # Create and configure the crawler
        crawler = BeautifulSoupCrawler(
            request_manager=request_manager,
            max_requests_per_crawl=10,
        )

        # Create default handler for requests without a specific label
        @crawler.router.default_handler
        async def handler(context: BeautifulSoupCrawlingContext) -> None:
            source = context.request.user_data.get('source', 'unknown')
            context.log.info(
                f'Processing request: {context.request.url} from source: {source}'
            )

        # Create handler for requests labeled 'apify'
        @crawler.router.handler('apify')
        async def apify_handler(context: BeautifulSoupCrawlingContext) -> None:
            source = context.request.user_data.get('source', 'unknown')
            context.log.info(
                f'Apify handler processing: {context.request.url} from source: {source}'
            )

        # Create handler for requests labeled 'crawlee'
        @crawler.router.handler('crawlee')
        async def crawlee_handler(context: BeautifulSoupCrawlingContext) -> None:
            source = context.request.user_data.get('source', 'unknown')
            context.log.info(
                f'Crawlee handler processing: {context.request.url} from source: {source}'
            )

        await crawler.run()


if __name__ == '__main__':
    asyncio.run(main())
```
New documentation page `using-sitemap-request-loader` (front matter and imports):

```mdx
---
id: using-sitemap-request-loader
title: Using sitemap request loader
---

import ApiLink from '@site/src/components/ApiLink';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import SitemapRequestLoaderExample from '!!raw-loader!roa-loader!./code_examples/using_sitemap_request_loader.py';
```
This example demonstrates how to use <ApiLink to="class/SitemapRequestLoader">`SitemapRequestLoader`</ApiLink> to crawl websites that provide `sitemap.xml` files following the [Sitemaps protocol](https://www.sitemaps.org/protocol.html). The <ApiLink to="class/SitemapRequestLoader">`SitemapRequestLoader`</ApiLink> processes sitemaps in a streaming fashion without loading them entirely into memory, making it suitable for large sitemaps.
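Basic usage without any transformation takes only a few lines. The following is a minimal sketch, assuming the default request queue backs the tandem returned by `to_tandem`:

```python
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler
from crawlee.http_clients import ImpitHttpClient
from crawlee.request_loaders import SitemapRequestLoader


async def main() -> None:
    async with SitemapRequestLoader(
        sitemap_urls=['https://crawlee.dev/sitemap.xml'],
        http_client=ImpitHttpClient(),
    ) as loader:
        # Wrap the loader in a tandem so the crawler can mark requests as handled.
        crawler = BeautifulSoupCrawler(request_manager=await loader.to_tandem())
        await crawler.run()


if __name__ == '__main__':
    asyncio.run(main())
```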
The example shows how to use the `transform_request_function` parameter to configure request options based on URL patterns. This allows you to modify request properties such as labels and user data based on the source URL, enabling different handling logic for different websites or sections.
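The transform contract has three outcomes: return the (possibly modified) `RequestOptions`, return `'unchanged'` to keep them as prepared, or return `'skip'` to drop the URL entirely. The sketch below illustrates this; the PDF-skipping rule is an assumption for demonstration, not part of this example:

```python
from crawlee import RequestOptions, RequestTransformAction


def transform_request(
    request_options: RequestOptions,
) -> RequestOptions | RequestTransformAction:
    url = request_options['url']
    # Drop binary documents from the crawl entirely.
    if url.endswith('.pdf'):
        return 'skip'
    # Route documentation pages to a dedicated handler.
    if '/docs/' in url:
        request_options['label'] = 'docs'
        return request_options
    # Leave every other URL untouched.
    return 'unchanged'
```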
The following code example processes sitemaps from two different domains (Apify and Crawlee), assigning different labels to requests based on their host. The `create_transform_request` function maps each host to the corresponding request configuration, while the crawler uses different handlers depending on the assigned labels.
```mdx
<RunnableCodeBlock className="language-python" language="python">
	{SitemapRequestLoaderExample}
</RunnableCodeBlock>
```
For more information about request loaders, see the [Request loaders guide](../guides/request-loaders).
Changes to the `SitemapRequestLoader` implementation:

```diff
@@ -9,7 +9,7 @@
 from pydantic import BaseModel, ConfigDict, Field
 from typing_extensions import override
 
-from crawlee import Request
+from crawlee import Request, RequestOptions
 from crawlee._utils.docs import docs_group
 from crawlee._utils.globs import Glob
 from crawlee._utils.recoverable_state import RecoverableState
@@ -18,9 +18,10 @@
 
 if TYPE_CHECKING:
     import re
-    from collections.abc import Sequence
+    from collections.abc import Callable, Sequence
     from types import TracebackType
 
+    from crawlee import RequestTransformAction
     from crawlee.http_clients import HttpClient
     from crawlee.proxy_configuration import ProxyInfo
     from crawlee.storage_clients.models import ProcessedRequest
```
@@ -112,6 +113,7 @@ def __init__( | |
| exclude: list[re.Pattern[Any] | Glob] | None = None, | ||
| max_buffer_size: int = 200, | ||
| persist_state_key: str | None = None, | ||
| transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None, | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Buuut... shouldn't this also receive an URL of the origin sitemap? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think that makes sense. A sitemap cannot contain links to another domain. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That makes a lot of sense, thanks. But I'm afraid that this won't "click" for a lot of people. Perhaps we could add an example that showcases this? |
||
| ) -> None: | ||
| """Initialize the sitemap request loader. | ||
|
|
||
|
|
@@ -125,13 +127,17 @@ def __init__( | |
| persist_state_key: A key for persisting the loader's state in the KeyValueStore. | ||
| When provided, allows resuming from where it left off after interruption. | ||
| If None, no state persistence occurs. | ||
| transform_request_function: An optional function to transform requests | ||
| generated by the loader. It receives `RequestOptions` with `url` and should return either | ||
| modified `RequestOptions` or a `RequestTransformAction`. | ||
| """ | ||
| self._http_client = http_client | ||
| self._sitemap_urls = sitemap_urls | ||
| self._include = include | ||
| self._exclude = exclude | ||
| self._proxy_info = proxy_info | ||
| self._max_buffer_size = max_buffer_size | ||
| self._transform_request_function = transform_request_function | ||
|
|
||
| # Synchronization for queue operations | ||
| self._queue_has_capacity = asyncio.Event() | ||
|
|
@@ -313,8 +319,15 @@ async def fetch_next_request(self) -> Request | None: | |
|
|
||
| async with self._queue_lock: | ||
| url = state.url_queue.popleft() | ||
|
|
||
| request = Request.from_url(url) | ||
| request_option = RequestOptions(url=url) | ||
| if self._transform_request_function: | ||
| transform_request_option = self._transform_request_function(request_option) | ||
| if transform_request_option == 'skip': | ||
| state.total_count -= 1 | ||
| continue | ||
| if transform_request_option != 'unchanged': | ||
| request_option = transform_request_option | ||
| request = Request.from_url(**request_option) | ||
| state.in_progress.add(request.url) | ||
| if len(state.url_queue) < self._max_buffer_size: | ||
| self._queue_has_capacity.set() | ||
|
|
||
Review comment: Maybe we should mention that a sitemap should only contain links to the same host here.

Reply: Good point. Added.
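For reference, the new dispatch in `fetch_next_request` can be read in isolation. The sketch below is a hypothetical stand-alone mirror of the branch structure above, not code from the PR: returning `'skip'` drops the URL (the loader also decrements its total count), `'unchanged'` keeps the prepared options, and any returned `RequestOptions` replaces them before the `Request` is built.

```python
from collections.abc import Callable

from crawlee import Request, RequestOptions, RequestTransformAction


def build_request(
    url: str,
    transform: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None,
) -> Request | None:
    """Mirror the loader's transform dispatch; return None when the URL is skipped."""
    request_option = RequestOptions(url=url)
    if transform:
        result = transform(request_option)
        if result == 'skip':
            return None
        if result != 'unchanged':
            request_option = result
    return Request.from_url(**request_option)


# A transform that skips everything yields no request at all.
assert build_request('https://example.com/', lambda _: 'skip') is None
```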