
Commit 5819610

doc: finish the examples section (#261)
### Description

- doc: finish the examples section
- slightly improve introduction section

### Related issues

- #171

### Testing

- Code examples were executed locally.
- Doc website was run locally with rendered content.

### Checklist

- [x] Changes are described in the `CHANGELOG.md`
- [x] CI passed
1 parent 701075a commit 5819610

20 files changed: +801 / -166 lines

docs/examples/add-data-to-dataset.md

Lines changed: 104 additions & 17 deletions

The previous version of the page consisted of a short intro and a single example:

This example saves data to the default dataset. If the dataset doesn't exist, it will be created. You can save data to custom datasets by passing `dataset_id` or `dataset_name` to `push_data`.

```python
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext

crawler = BeautifulSoupCrawler()

# Function called for each URL
@crawler.router.default_handler
async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
    await context.push_data({
        "url": context.request.url,
        "html": context.http_response.text(),
    })

async def main() -> None:
    # Run the crawler
    await crawler.run([
        'http://www.example.com/page-1',
        'http://www.example.com/page-2',
        'http://www.example.com/page-3',
    ])

asyncio.run(main())
```

It is replaced by the following page content:

---
id: add-data-to-dataset
title: Add data to dataset
---

import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

This example demonstrates how to store extracted data into datasets using the `context.push_data()` helper function. If the specified dataset does not already exist, it will be created automatically. Additionally, you can save data to custom datasets by providing `dataset_id` or `dataset_name` parameters to the `push_data` method.

<Tabs groupId="main">
<TabItem value="BeautifulSoupCrawler" label="BeautifulSoupCrawler">

```python
import asyncio

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler()

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract data from the page.
        data = {
            'url': context.request.url,
            'title': context.soup.title.string,
            'html': str(context.soup)[:1000],
        }

        # Push the extracted data to the default dataset.
        await context.push_data(data)

    # Run the crawler with the initial list of requests.
    await crawler.run(
        [
            'https://crawlee.dev',
            'https://apify.com',
            'https://example.com',
        ]
    )


if __name__ == '__main__':
    asyncio.run(main())
```

</TabItem>
<TabItem value="PlaywrightCrawler" label="PlaywrightCrawler">

```python
import asyncio

from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    crawler = PlaywrightCrawler()

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract data from the page.
        data = {
            'url': context.request.url,
            'title': await context.page.title(),
            'html': str(await context.page.content())[:1000],
        }

        # Push the extracted data to the default dataset.
        await context.push_data(data)

    # Run the crawler with the initial list of requests.
    await crawler.run(
        [
            'https://crawlee.dev',
            'https://apify.com',
            'https://example.com',
        ]
    )


if __name__ == '__main__':
    asyncio.run(main())
```

</TabItem>
</Tabs>

Each item in the dataset will be stored in its own file within the following directory:

```text
{PROJECT_FOLDER}/storage/datasets/default/
```

For more control, you can also open a dataset manually using the asynchronous constructor `Dataset.open()` and interact with it directly:

```python
from crawlee.storages import Dataset

# ...


async def main() -> None:
    # Open dataset manually using asynchronous constructor open().
    dataset = await Dataset.open()

    # Interact with dataset directly.
    await dataset.push_data({'key': 'value'})

# ...
```
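As the intro above notes, `push_data` also accepts `dataset_id` or `dataset_name` for writing to a custom dataset. The following is only a rough sketch of that variant and is not part of the commit; the dataset name `'titles'` is an arbitrary illustration, and the exact `push_data` signature should be checked against your Crawlee version:

```python
import asyncio

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler()

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        # Push the record to a named dataset instead of the default one.
        # 'titles' is a made-up name used purely for illustration.
        await context.push_data(
            {
                'url': context.request.url,
                'title': context.soup.title.string if context.soup.title else None,
            },
            dataset_name='titles',
        )

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```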
Lines changed: 55 additions & 0 deletions (new file)

---
id: beautifulsoup-crawler
title: BeautifulSoup crawler
---

This example demonstrates how to use `BeautifulSoupCrawler` to crawl a list of URLs, load each URL using a plain HTTP request, parse the HTML using the [BeautifulSoup](https://pypi.org/project/beautifulsoup4/) library and extract some data from it - the page title and all `<h1>`, `<h2>` and `<h3>` tags. This setup is perfect for scraping specific elements from web pages. Thanks to the well-known BeautifulSoup, you can easily navigate the HTML structure and retrieve the data you need with minimal code.

```python
import asyncio
from datetime import timedelta

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    # Create an instance of the BeautifulSoupCrawler class, a crawler that automatically
    # loads the URLs and parses their HTML using the BeautifulSoup library.
    crawler = BeautifulSoupCrawler(
        # On error, retry each page at most once.
        max_request_retries=1,
        # Increase the timeout for processing each page to 30 seconds.
        request_handler_timeout=timedelta(seconds=30),
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    # The handler receives a context parameter, providing various properties and
    # helper methods. Here are a few key ones we use for demonstration:
    # - request: an instance of the Request class containing details such as the URL
    #   being crawled and the HTTP method used.
    # - soup: the BeautifulSoup object containing the parsed HTML of the response.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract data from the page.
        data = {
            'url': context.request.url,
            'title': context.soup.title.string if context.soup.title else None,
            'h1s': [h1.text for h1 in context.soup.find_all('h1')],
            'h2s': [h2.text for h2 in context.soup.find_all('h2')],
            'h3s': [h3.text for h3 in context.soup.find_all('h3')],
        }

        # Push the extracted data to the default dataset. In local configuration,
        # the data will be stored as JSON files in ./storage/datasets/default.
        await context.push_data(data)

    # Run the crawler with the initial list of URLs.
    await crawler.run(['https://crawlee.dev'])

if __name__ == '__main__':
    asyncio.run(main())
```
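Because `context.soup` is an ordinary BeautifulSoup object, any of its selection methods can be used inside the handler, not just `find_all`. Below is a small additional sketch, not part of the commit, that pulls link texts and `href` attributes with a CSS selector; the `'links'` field name is just an illustration:

```python
import asyncio

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        # CSS selectors work exactly as in standalone BeautifulSoup.
        links = [
            {'text': a.get_text(strip=True), 'href': a.get('href')}
            for a in context.soup.select('a[href]')
        ]
        await context.push_data({'url': context.request.url, 'links': links})

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```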
Lines changed: 60 additions & 0 deletions (new file)

---
id: capture-screenshots-using-playwright
title: Capture screenshots using Playwright
---

This example demonstrates how to capture screenshots of web pages using `PlaywrightCrawler` and store them in the key-value store.

The `PlaywrightCrawler` is configured to automate the browsing and interaction with web pages. It uses headless Chromium as the browser type to perform these tasks. Each web page specified in the initial list of URLs is visited sequentially, and a screenshot of the page is captured using Playwright's `page.screenshot()` method.

The captured screenshots are stored in the key-value store, which is suitable for managing and storing files in various formats. In this case, screenshots are stored as PNG images with a unique key generated from the URL of the page.

```python
import asyncio

from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.storages import KeyValueStore


async def main() -> None:
    crawler = PlaywrightCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
        # Headless mode, set to False to see the browser in action.
        headless=False,
        # Browser types supported by Playwright.
        browser_type='chromium',
    )

    # Open the default key-value store.
    kvs = await KeyValueStore.open()

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Capture the screenshot of the page using Playwright's API.
        screenshot = await context.page.screenshot()
        name = context.request.url.split('/')[-1]

        # Store the screenshot in the key-value store.
        await kvs.set_value(
            key=f'screenshot-{name}',
            value=screenshot,
            content_type='image/png',
        )

    # Run the crawler with the initial list of URLs.
    await crawler.run(
        [
            'https://crawlee.dev',
            'https://apify.com',
            'https://example.com',
        ]
    )


if __name__ == '__main__':
    asyncio.run(main())
```
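To check a captured screenshot afterwards, it can be read back from the same default key-value store. This is a sketch outside the commit, assuming `KeyValueStore.get_value()` returns the raw bytes that were stored; the key follows the naming scheme used above (for `https://crawlee.dev` the URL split yields `crawlee.dev`):

```python
import asyncio
from pathlib import Path

from crawlee.storages import KeyValueStore


async def main() -> None:
    # Open the same default key-value store the crawler wrote to.
    kvs = await KeyValueStore.open()

    # Read one screenshot back; the key matches the crawler's naming scheme.
    screenshot = await kvs.get_value('screenshot-crawlee.dev')

    if screenshot is not None:
        # The stored value is raw PNG bytes, so it can be written straight to disk.
        Path('crawlee.dev.png').write_bytes(screenshot)


if __name__ == '__main__':
    asyncio.run(main())
```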

docs/examples/crawl-all-links-on-a-website.md

Lines changed: 0 additions & 32 deletions
This file was deleted.
Lines changed: 80 additions & 0 deletions (new file)

---
id: crawl-all-links-on-website
title: Crawl all links on website
---

import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

This example uses the `enqueue_links()` helper to add new links to the `RequestQueue` as the crawler navigates from page to page. By automatically discovering and enqueuing all links on a given page, the crawler can systematically scrape an entire website. This approach is ideal for web scraping tasks where you need to collect data from multiple interconnected pages.

:::tip

If no options are given, by default the method will only add links that are under the same subdomain. This behavior can be controlled with the `strategy` option. You can find more info about this option in the [Crawl website with relative links](./crawl-website-with-relative-links) example.

:::

<Tabs groupId="main">
<TabItem value="BeautifulSoupCrawler" label="BeautifulSoupCrawler">

```python
import asyncio

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Enqueue all links found on the page.
        await context.enqueue_links()

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```

</TabItem>
<TabItem value="PlaywrightCrawler" label="PlaywrightCrawler">

```python
import asyncio

from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    crawler = PlaywrightCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Enqueue all links found on the page.
        await context.enqueue_links()

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```

</TabItem>
</Tabs>
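The tip in this page mentions the `strategy` option of `enqueue_links()`. As a rough sketch outside the commit, following links beyond the current subdomain could look like the snippet below; the `EnqueueStrategy` import path and its accepted values are assumptions that may differ between Crawlee versions, so the linked "Crawl website with relative links" example remains the authoritative reference:

```python
import asyncio

from crawlee import EnqueueStrategy  # assumed import path; may differ by version
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Follow every discovered link, not only those on the same subdomain.
        await context.enqueue_links(strategy=EnqueueStrategy.ALL)

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```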
