Commit 457a2aa

feat: add integration for the api

1 parent e6f18bc

File tree

10 files changed: +80 -54 lines changed

Binary file (2.71 KB) not shown.
Binary file (3.66 KB) not shown.

examples/.env

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+SCRAPEGRAPH_API_KEY="***REMOVED***"
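
The example script below consumes this key at runtime. A minimal sketch of the lookup, assuming python-dotenv's default search finds this file relative to the running script:

    from dotenv import load_dotenv
    import os

    load_dotenv()  # walks up from the script's directory looking for a .env file
    api_key = os.getenv("SCRAPEGRAPH_API_KEY")  # None if the variable is absent
    assert api_key, "Set SCRAPEGRAPH_API_KEY in examples/.env"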

examples/scrape_example.py

Lines changed: 10 additions & 33 deletions
@@ -1,39 +1,16 @@
-import os
-from dotenv import load_dotenv
 from scrapegraphaiapisdk.scrape import scrape
-from pydantic import BaseModel
-from typing import List
-
-# Load environment variables from .env file
-load_dotenv()
-
-class Product(BaseModel):
-    name: str
-    price: float
-    description: str
-
-class ProductList(BaseModel):
-    products: List[Product]
+from dotenv import load_dotenv  # Import load_dotenv
+import os  # Import os to access environment variables
+import json  # Import json for beautifying output
 
 def main():
-    # Get API key from environment variables
+    """Main function to execute the scraping process."""
+    load_dotenv()
     api_key = os.getenv("SCRAPEGRAPH_API_KEY")
-
-    # URL to scrape
-    url = "https://example.com/products"
-
-    # Natural language prompt
-    prompt = "Extract all products from this page including their names, prices, and descriptions"
-
-    # Create schema
-    schema = ProductList
-
-    # Make the request
-    try:
-        result = scrape(api_key, url, prompt, schema)
-        print(f"Scraped data: {result}")
-    except Exception as e:
-        print(f"Error occurred: {e}")
+    url = "https://scrapegraphai.com/"
+    prompt = "What does the company do?"
 
+    result = scrape(api_key, url, prompt)
+    print(result)
 if __name__ == "__main__":
-    main()
+    main()
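
The rewritten example imports json "for beautifying output" but never calls it. A minimal sketch of how the result could be pretty-printed, assuming the endpoint returns a JSON string (with a fallback for plain-text responses):

    import json

    result = scrape(api_key, url, prompt)
    try:
        # Pretty-print when the response body is valid JSON
        print(json.dumps(json.loads(result), indent=2))
    except json.JSONDecodeError:
        # Fall back to the raw text otherwise
        print(result)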

pyproject.toml

Lines changed: 2 additions & 1 deletion
@@ -10,7 +10,8 @@ authors = [
 
 dependencies = [
     "requests>=2.32.3",
-    "pydantic>=2.9.2"
+    "pydantic>=2.9.2",
+    "python-dotenv>=1.0.1"
 ]
 
 license = "MIT"

requirements-dev.lock

Lines changed: 11 additions & 0 deletions
@@ -10,6 +10,8 @@
 -e file:.
 alabaster==0.7.16
     # via sphinx
+annotated-types==0.7.0
+    # via pydantic
 astroid==3.3.5
     # via pylint
 babel==2.16.0
@@ -51,14 +53,21 @@ platformdirs==4.3.6
     # via pylint
 pluggy==1.5.0
     # via pytest
+pydantic==2.9.2
+    # via scrapegraphaiapisdk
+pydantic-core==2.23.4
+    # via pydantic
 pygments==2.18.0
     # via furo
     # via sphinx
 pylint==3.3.1
 pytest==8.0.0
     # via pytest-mock
 pytest-mock==3.14.0
+python-dotenv==1.0.1
+    # via scrapegraphaiapisdk
 requests==2.32.3
+    # via scrapegraphaiapisdk
     # via sphinx
 snowballstemmer==2.2.0
     # via sphinx
@@ -89,6 +98,8 @@ tomlkit==0.13.2
     # via pylint
 typing-extensions==4.12.2
     # via astroid
+    # via pydantic
+    # via pydantic-core
     # via pylint
 urllib3==2.2.3
     # via requests

requirements.lock

Lines changed: 21 additions & 0 deletions
@@ -8,3 +8,24 @@
 # with-sources: false
 
 -e file:.
+annotated-types==0.7.0
+    # via pydantic
+certifi==2024.8.30
+    # via requests
+charset-normalizer==3.4.0
+    # via requests
+idna==3.10
+    # via requests
+pydantic==2.9.2
+    # via scrapegraphaiapisdk
+pydantic-core==2.23.4
+    # via pydantic
+python-dotenv==1.0.1
+    # via scrapegraphaiapisdk
+requests==2.32.3
+    # via scrapegraphaiapisdk
+typing-extensions==4.12.2
+    # via pydantic
+    # via pydantic-core
+urllib3==2.2.3
+    # via requests
Binary file (175 Bytes) not shown.
Binary file (2.06 KB) not shown.

scrapegraphaiapisdk/scrape.py

Lines changed: 35 additions & 20 deletions
@@ -1,38 +1,53 @@
-"""
-This module provides a function to scrape and extract structured data from a webpage
-using the ScrapeGraph AI API. It allows specifying a schema for the output structure
-using a Pydantic model.
-"""
-
 from pydantic import BaseModel
 import requests
+import argparse
+from typing import Optional
+import json
+
+class ExampleSchema(BaseModel):
+    """Define an example schema for the output structure, if needed."""
+    name: str
+    description: str
 
-def scrape(api_key: str, url: str, prompt: str, schema: BaseModel) -> str:
+def scrape(api_key: str, url: str, prompt: str, schema: Optional[BaseModel] = None) -> str:
     """Scrape and extract structured data from a webpage using ScrapeGraph AI.
 
     Args:
-        api_key (str): Your ScrapeGraph AI API key
-        url (str): The URL of the webpage to scrape
-        prompt (str): Natural language prompt describing what data to extract
-        schema (BaseModel): Pydantic model defining the output structure.
-            The model will be converted to JSON schema before making the request.
+        api_key (str): Your ScrapeGraph AI API key.
+        url (str): The URL of the webpage to scrape.
+        prompt (str): Natural language prompt describing what data to extract.
+        schema (Optional[BaseModel]): Pydantic model defining the output structure,
+            if provided. The model will be converted to JSON schema before making
+            the request.
 
     Returns:
-        str: Extracted data in JSON format matching the provided schema
+        str: Extracted data in JSON format matching the provided schema.
     """
-    endpoint = "https://api.scrapegraph.ai/v1/scrape"
+    endpoint = "https://sgai-api.onrender.com/api/v1/smartscraper"
     headers = {
-        "Authorization": f"Bearer {api_key}",
+        "accept": "application/json",
+        "SGAI-API-KEY": api_key,
         "Content-Type": "application/json"
     }
 
     payload = {
-        "url": url,
-        "prompt": prompt,
-        "schema": schema.model_json_schema()
+        "website_url": url,
+        "user_prompt": prompt
     }
 
-    response = requests.post(endpoint, headers=headers, json=payload)
-    response.raise_for_status()
+    if schema:
+        payload["schema"] = schema.model_json_schema()
+
+    try:
+        response = requests.post(endpoint, headers=headers, json=payload)
+        response.raise_for_status()
+    except requests.exceptions.HTTPError as http_err:
+        # Handle HTTP errors specifically
+        if response.status_code == 403:
+            return json.dumps({"error": "Access forbidden (403)", "message": "You do not have permission to access this resource."})
+        return json.dumps({"error": "HTTP error occurred", "message": str(http_err), "status_code": response.status_code})
+    except requests.exceptions.RequestException as e:
+        # Handle other request exceptions (e.g., connection errors, timeouts)
+        return json.dumps({"error": "An error occurred", "message": str(e)})
 
     return response.text
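
A minimal usage sketch of the updated signature, reusing the ExampleSchema model this diff adds; the second call exercises the new optional-schema branch, which attaches model_json_schema() to the payload:

    import os
    from dotenv import load_dotenv
    from scrapegraphaiapisdk.scrape import scrape, ExampleSchema

    load_dotenv()
    api_key = os.getenv("SCRAPEGRAPH_API_KEY")

    # Without a schema: the payload carries only website_url and user_prompt
    print(scrape(api_key, "https://scrapegraphai.com/", "What does the company do?"))

    # With a schema: the JSON schema for ExampleSchema is sent under the "schema" key
    print(scrape(api_key, "https://scrapegraphai.com/",
                 "Extract the site name and a one-line description",
                 schema=ExampleSchema))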
