Commit f5e907e

feat: add sitemap endpoint

1 parent d0a10e5 commit f5e907e

8 files changed: +668 -12 lines changed

.agent/README.md

Lines changed: 2 additions & 0 deletions

@@ -212,6 +212,7 @@ Both SDKs support the following endpoints:
 | SmartScraper | ✅ | ✅ | AI-powered data extraction |
 | SearchScraper | ✅ | ✅ | Multi-website search extraction |
 | Markdownify | ✅ | ✅ | HTML to Markdown conversion |
+| Sitemap | ✅ | ✅ | Sitemap URL extraction |
 | SmartCrawler | ✅ | ✅ | Sitemap generation & crawling |
 | AgenticScraper | ✅ | ✅ | Browser automation |
 | Scrape | ✅ | ✅ | Basic HTML extraction |

@@ -259,6 +260,7 @@ Both SDKs support the following endpoints:
 - `searchScraper.js`
 - `crawl.js`
 - `markdownify.js`
+- `sitemap.js`
 - `agenticScraper.js`
 - `scrape.js`
 - `scheduledJobs.js`

scrapegraph-js/README.md

Lines changed: 36 additions & 0 deletions

@@ -451,6 +451,27 @@ const url = 'https://scrapegraphai.com/';
 })();
 ```
 
+### Sitemap
+
+Extract all URLs from a website's sitemap. Automatically discovers the sitemap from robots.txt or common sitemap locations.
+
+```javascript
+import { sitemap } from 'scrapegraph-js';
+
+const apiKey = 'your-api-key';
+const websiteUrl = 'https://example.com';
+
+(async () => {
+  try {
+    const response = await sitemap(apiKey, websiteUrl);
+    console.log('Total URLs found:', response.urls.length);
+    console.log('URLs:', response.urls);
+  } catch (error) {
+    console.error('Error:', error);
+  }
+})();
+```
+
 ### Checking API Credits
 
 ```javascript

@@ -688,6 +709,21 @@ Starts a crawl job to extract structured data from a website and its linked pages.
 
 Converts a webpage into clean, well-structured markdown format.
 
+### Sitemap
+
+#### `sitemap(apiKey, websiteUrl, options)`
+
+Extracts all URLs from a website's sitemap. Automatically discovers the sitemap from robots.txt or common sitemap locations.
+
+**Parameters:**
+- `apiKey` (string): Your ScrapeGraph AI API key
+- `websiteUrl` (string): The URL of the website to extract the sitemap from
+- `options` (object, optional): Additional options
+  - `mock` (boolean): Override mock mode for this request
+
+**Returns:** Promise resolving to an object containing:
+- `urls` (array): List of URLs extracted from the sitemap
+
 ### Agentic Scraper
 
 #### `agenticScraper(apiKey, url, steps, useSession, userPrompt, outputSchema, aiExtraction)`
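Since `options.mock` is the only option documented for the new endpoint, a minimal sketch of using it may help; the flag itself comes from this diff, while the key and URL are placeholders:

```javascript
import { sitemap } from 'scrapegraph-js';

// Force mock mode for this call only: per src/sitemap.js in this commit,
// a stubbed response is returned and no HTTP request is made.
const stub = await sitemap('your-api-key', 'https://example.com', { mock: true });
console.log(stub);
```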
Lines changed: 72 additions & 0 deletions (new file)

```javascript
import { sitemap } from 'scrapegraph-js';
import fs from 'fs';
import 'dotenv/config';

/**
 * Example: Extract sitemap URLs from a website
 *
 * This example demonstrates how to use the sitemap endpoint to extract
 * all URLs from a website's sitemap.xml file.
 */

// Get API key from environment variable
const apiKey = process.env.SGAI_APIKEY;

// Target website URL
const url = 'https://scrapegraphai.com/';

console.log('🗺️ Extracting sitemap from:', url);
console.log('⏳ Please wait...\n');

try {
  // Call the sitemap endpoint
  const response = await sitemap(apiKey, url);

  console.log('✅ Sitemap extracted successfully!');
  console.log(`📊 Total URLs found: ${response.urls.length}\n`);

  // Display first 10 URLs
  console.log('📄 First 10 URLs:');
  response.urls.slice(0, 10).forEach((url, index) => {
    console.log(`  ${index + 1}. ${url}`);
  });

  if (response.urls.length > 10) {
    console.log(`  ... and ${response.urls.length - 10} more URLs`);
  }

  // Save the complete list to a file
  saveUrlsToFile(response.urls, 'sitemap_urls.txt');

  // Save as JSON for programmatic use
  saveUrlsToJson(response, 'sitemap_urls.json');
} catch (error) {
  console.error('❌ Error:', error.message);
  process.exit(1);
}

/**
 * Helper function to save URLs to a text file
 */
function saveUrlsToFile(urls, filename) {
  try {
    const content = urls.join('\n');
    fs.writeFileSync(filename, content);
    console.log(`\n💾 URLs saved to: ${filename}`);
  } catch (err) {
    console.error('❌ Error saving file:', err.message);
  }
}

/**
 * Helper function to save complete response as JSON
 */
function saveUrlsToJson(response, filename) {
  try {
    fs.writeFileSync(filename, JSON.stringify(response, null, 2));
    console.log(`💾 JSON saved to: ${filename}`);
  } catch (err) {
    console.error('❌ Error saving JSON:', err.message);
  }
}
```
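One caveat when adapting this example: it assumes `SGAI_APIKEY` is set in the environment. A guard like the following (not part of the commit) fails fast with a clear message instead of an opaque API error:

```javascript
// Hypothetical guard, not in the committed example: exit early when the
// SGAI_APIKEY environment variable is missing.
if (!apiKey) {
  console.error('❌ SGAI_APIKEY is not set. Export it or add it to your .env file.');
  process.exit(1);
}
```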
Lines changed: 106 additions & 0 deletions (new file)

```javascript
import { sitemap, smartScraper } from 'scrapegraph-js';
import 'dotenv/config';

/**
 * Advanced Example: Extract sitemap and scrape selected URLs
 *
 * This example demonstrates how to combine the sitemap endpoint
 * with smartScraper to extract structured data from multiple pages.
 */

const apiKey = process.env.SGAI_APIKEY;

// Configuration
const websiteUrl = 'https://scrapegraphai.com/';
const maxPagesToScrape = 3; // Limit number of pages to scrape
const userPrompt = 'Extract the page title and main heading';

console.log('🗺️ Step 1: Extracting sitemap from:', websiteUrl);
console.log('⏳ Please wait...\n');

try {
  // Step 1: Get all URLs from the sitemap
  const sitemapResponse = await sitemap(apiKey, websiteUrl);

  console.log('✅ Sitemap extracted successfully!');
  console.log(`📊 Total URLs found: ${sitemapResponse.urls.length}\n`);

  // Step 2: Filter URLs (example: only blog posts)
  const filteredUrls = sitemapResponse.urls
    .filter(url => url.includes('/blog/') || url.includes('/post/'))
    .slice(0, maxPagesToScrape);

  if (filteredUrls.length === 0) {
    console.log('ℹ️ No blog URLs found, using first 3 URLs instead');
    filteredUrls.push(...sitemapResponse.urls.slice(0, maxPagesToScrape));
  }

  console.log(`🎯 Selected ${filteredUrls.length} URLs to scrape:`);
  filteredUrls.forEach((url, index) => {
    console.log(`  ${index + 1}. ${url}`);
  });

  // Step 3: Scrape each selected URL
  console.log('\n🤖 Step 2: Scraping selected URLs...\n');

  const results = [];

  for (let i = 0; i < filteredUrls.length; i++) {
    const url = filteredUrls[i];
    console.log(`📄 Scraping (${i + 1}/${filteredUrls.length}): ${url}`);

    try {
      const scrapeResponse = await smartScraper(apiKey, url, userPrompt);

      results.push({
        url: url,
        data: scrapeResponse.result,
        status: 'success'
      });

      console.log('  ✅ Success');

      // Add a small delay between requests to avoid rate limiting
      if (i < filteredUrls.length - 1) {
        await new Promise(resolve => setTimeout(resolve, 1000));
      }
    } catch (error) {
      console.log(`  ❌ Failed: ${error.message}`);
      results.push({
        url: url,
        error: error.message,
        status: 'failed'
      });
    }
  }

  // Step 4: Display results
  console.log('\n📊 Scraping Results:\n');
  results.forEach((result, index) => {
    console.log(`${index + 1}. ${result.url}`);
    if (result.status === 'success') {
      console.log('   Status: ✅ Success');
      console.log('   Data:', JSON.stringify(result.data, null, 2));
    } else {
      console.log('   Status: ❌ Failed');
      console.log('   Error:', result.error);
    }
    console.log('');
  });

  // Summary
  const successCount = results.filter(r => r.status === 'success').length;
  console.log('📈 Summary:');
  console.log(`  ✅ Successful: ${successCount}`);
  console.log(`  ❌ Failed: ${results.length - successCount}`);
  console.log(`  📊 Total: ${results.length}`);
} catch (error) {
  console.error('❌ Error:', error.message);
  process.exit(1);
}
```
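The sequential loop in Step 3 trades speed for politeness via the 1-second delay. If your rate limits tolerate parallel requests, a concurrent variant is a reasonable alternative; this sketch reuses the same `filteredUrls`, `apiKey`, and `userPrompt` and is not part of the commit:

```javascript
// Hypothetical concurrent variant of Step 3 (not in the commit): fire all
// smartScraper calls at once and rebuild the same results shape used above.
const settled = await Promise.allSettled(
  filteredUrls.map(url => smartScraper(apiKey, url, userPrompt))
);
const concurrentResults = settled.map((outcome, i) =>
  outcome.status === 'fulfilled'
    ? { url: filteredUrls[i], data: outcome.value.result, status: 'success' }
    : { url: filteredUrls[i], error: outcome.reason.message, status: 'failed' }
);
console.log(`Scraped ${concurrentResults.length} pages concurrently`);
```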

scrapegraph-js/index.js

Lines changed: 12 additions & 11 deletions

@@ -7,17 +7,18 @@ export { getCredits } from './src/credits.js';
 export { sendFeedback } from './src/feedback.js';
 export { crawl, getCrawlRequest } from './src/crawl.js';
 export { generateSchema, getSchemaStatus, pollSchemaGeneration } from './src/schema.js';
-export {
-createScheduledJob,
-getScheduledJobs,
-getScheduledJob,
-updateScheduledJob,
-replaceScheduledJob,
-deleteScheduledJob,
-pauseScheduledJob,
-resumeScheduledJob,
-triggerScheduledJob,
-getJobExecutions
+export { sitemap } from './src/sitemap.js';
+export {
+  createScheduledJob,
+  getScheduledJobs,
+  getScheduledJob,
+  updateScheduledJob,
+  replaceScheduledJob,
+  deleteScheduledJob,
+  pauseScheduledJob,
+  resumeScheduledJob,
+  triggerScheduledJob,
+  getJobExecutions
 } from './src/scheduledJobs.js';
 
 // Mock utilities

scrapegraph-js/src/schema.js

Lines changed: 1 addition & 1 deletion

@@ -2,7 +2,7 @@
  * Schema generation functionality for ScrapeGraph JavaScript SDK
  */
 
-import { handleError } from './utils/handleError.js';
+import handleError from './utils/handleError.js';
 
 /**
  * Generate a JSON schema from a user prompt
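This one-line fix implies that `./utils/handleError.js` uses a default export, so the earlier named import would have bound `undefined` and crashed at call time. A sketch of the assumed module shape (its real body is not part of this diff):

```javascript
// Assumed shape of './utils/handleError.js': a default export. The body here
// is purely illustrative; the actual implementation is not shown in this commit.
export default function handleError(error) {
  // e.g. unwrap an axios error into a readable message and rethrow
  throw new Error(error?.response?.data?.error ?? error.message);
}
```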

scrapegraph-js/src/sitemap.js

Lines changed: 68 additions & 0 deletions (new file)

```javascript
import axios from 'axios';
import handleError from './utils/handleError.js';
import { isMockEnabled, getMockConfig } from './utils/mockConfig.js';
import { getMockResponse } from './utils/mockResponse.js';

/**
 * Extract all URLs from a website's sitemap.
 * Automatically discovers the sitemap from robots.txt or common sitemap locations.
 *
 * @param {string} apiKey - Your ScrapeGraph AI API key.
 * @param {string} websiteUrl - The URL of the website to extract the sitemap from.
 * @param {Object} options - Optional configuration options.
 * @param {boolean} options.mock - Override mock mode for this request.
 * @returns {Promise<Object>} A promise that resolves to an object containing:
 * - urls: Array of URLs extracted from the sitemap
 * @throws {Error} Throws an error if the HTTP request fails.
 *
 * @example
 * // Basic usage:
 * const apiKey = 'your-api-key';
 * const websiteUrl = 'https://example.com';
 *
 * try {
 *   const result = await sitemap(apiKey, websiteUrl);
 *   console.log('Sitemap URLs:', result.urls);
 *   console.log('Total URLs found:', result.urls.length);
 * } catch (error) {
 *   console.error('Error:', error);
 * }
 *
 * @example
 * // Processing sitemap URLs:
 * const result = await sitemap(apiKey, 'https://example.com');
 * result.urls.forEach(url => {
 *   console.log('Found URL:', url);
 * });
 */
export async function sitemap(apiKey, websiteUrl, options = {}) {
  const { mock = null } = options;

  // Check if mock mode is enabled
  const useMock = mock !== null ? mock : isMockEnabled();

  if (useMock) {
    console.log('🧪 Mock mode active. Returning stub for sitemap request');
    const mockConfig = getMockConfig();
    const mockData = getMockResponse(
      'POST',
      'https://api.scrapegraphai.com/v1/sitemap',
      mockConfig.customResponses,
      mockConfig.customHandler
    );
    return mockData;
  }

  const endpoint = 'https://api.scrapegraphai.com/v1/sitemap';
  const headers = {
    'accept': 'application/json',
    'SGAI-APIKEY': apiKey,
    'Content-Type': 'application/json',
  };

  const payload = {
    website_url: websiteUrl,
  };

  try {
    const response = await axios.post(endpoint, payload, { headers });
    return response.data;
  } catch (error) {
    handleError(error);
  }
}
```
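For readers who want the endpoint without the SDK, the implementation above pins down the wire format: a POST to `/v1/sitemap` with a `website_url` payload and an `SGAI-APIKEY` header, answered with JSON containing a `urls` array. A minimal sketch using only what the code shows:

```javascript
// Minimal sketch of the same request without the SDK (Node 18+ global fetch).
// Everything here is taken from the implementation above; the example.com URL
// is a placeholder.
const res = await fetch('https://api.scrapegraphai.com/v1/sitemap', {
  method: 'POST',
  headers: {
    'accept': 'application/json',
    'SGAI-APIKEY': process.env.SGAI_APIKEY,
    'Content-Type': 'application/json',
  },
  body: JSON.stringify({ website_url: 'https://example.com' }),
});
const data = await res.json();
console.log(`Found ${data.urls.length} URLs`);
```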
