import { sitemap, smartScraper } from 'scrapegraph-js';
import 'dotenv/config';

/**
 * Advanced Example: Extract sitemap and scrape selected URLs
 *
 * This example demonstrates how to combine the sitemap endpoint
 * with smartScraper to extract structured data from multiple pages.
 */

const apiKey = process.env.SGAI_APIKEY;
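
// Fail fast if SGAI_APIKEY is not configured; the API calls below would
// otherwise fail later with an auth error. (dotenv, imported above, loads
// the key from a .env file if present.)
if (!apiKey) {
  console.error('❌ Missing SGAI_APIKEY environment variable (set it in .env)');
  process.exit(1);
}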

// Configuration
const websiteUrl = 'https://scrapegraphai.com/';
const maxPagesToScrape = 3; // Limit number of pages to scrape
const userPrompt = 'Extract the page title and main heading';

console.log('🗺️ Step 1: Extracting sitemap from:', websiteUrl);
console.log('⏳ Please wait...\n');

try {
  // Step 1: Get all URLs from sitemap
  const sitemapResponse = await sitemap(apiKey, websiteUrl);
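  // Response shape (assumed from how it is used below): an object with a
  // `urls` array of URL strings, e.g. { urls: ['https://scrapegraphai.com/', ...] }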

  console.log('✅ Sitemap extracted successfully!');
  console.log(`📊 Total URLs found: ${sitemapResponse.urls.length}\n`);

  // Step 2: Filter URLs (example: only blog posts)
  const filteredUrls = sitemapResponse.urls
    .filter(url => url.includes('/blog/') || url.includes('/post/'))
    .slice(0, maxPagesToScrape);

  if (filteredUrls.length === 0) {
    console.log(`ℹ️ No blog URLs found, using first ${maxPagesToScrape} URLs instead`);
    filteredUrls.push(...sitemapResponse.urls.slice(0, maxPagesToScrape));
  }

  console.log(`🎯 Selected ${filteredUrls.length} URLs to scrape:`);
  filteredUrls.forEach((url, index) => {
    console.log(`   ${index + 1}. ${url}`);
  });

  // Step 3: Scrape each selected URL
  console.log('\n🤖 Step 2: Scraping selected URLs...\n');

  const results = [];

  for (let i = 0; i < filteredUrls.length; i++) {
    const url = filteredUrls[i];
    console.log(`📄 Scraping (${i + 1}/${filteredUrls.length}): ${url}`);

    try {
      const scrapeResponse = await smartScraper(
        apiKey,
        url,
        userPrompt
      );

      results.push({
        url: url,
        data: scrapeResponse.result,
        status: 'success'
      });

      console.log('   ✅ Success');

      // Add a small delay between requests to avoid rate limiting
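      // (the 1000 ms pause is an arbitrary politeness value; tune it to your
      // plan's rate limits or swap in exponential backoff)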
      if (i < filteredUrls.length - 1) {
        await new Promise(resolve => setTimeout(resolve, 1000));
      }

    } catch (error) {
      console.log(`   ❌ Failed: ${error.message}`);
      results.push({
        url: url,
        error: error.message,
        status: 'failed'
      });
    }
  }
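
  // A concurrent alternative (sketch, not used here): fire all requests at
  // once with Promise.allSettled. Faster for many URLs, but it skips the
  // politeness delay above, so it is likelier to trip rate limits:
  //
  //   const settled = await Promise.allSettled(
  //     filteredUrls.map(url => smartScraper(apiKey, url, userPrompt))
  //   );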

  // Step 4: Display results
  console.log('\n📊 Scraping Results:\n');
  results.forEach((result, index) => {
    console.log(`${index + 1}. ${result.url}`);
    if (result.status === 'success') {
      console.log('   Status: ✅ Success');
      console.log('   Data:', JSON.stringify(result.data, null, 2));
    } else {
      console.log('   Status: ❌ Failed');
      console.log('   Error:', result.error);
    }
    console.log('');
  });

  // Summary
  const successCount = results.filter(r => r.status === 'success').length;
  console.log('📈 Summary:');
  console.log(`   ✅ Successful: ${successCount}`);
  console.log(`   ❌ Failed: ${results.length - successCount}`);
  console.log(`   📊 Total: ${results.length}`);

} catch (error) {
  console.error('❌ Error:', error.message);
  process.exit(1);
}
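
// To run (assumes Node 18+ with ES modules, since the script uses top-level
// await; the filename is illustrative):
//   SGAI_APIKEY=your-api-key node sitemap_advanced_example.js
// Alternatively, put SGAI_APIKEY in a .env file, which `import 'dotenv/config'`
// loads automatically.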