11from __future__ import annotations
22
33import asyncio
4- import codecs
5- import pickle
64from base64 import b64encode
75from urllib .parse import unquote
86
97try :
10- from scrapy import Request , Spider
118 from scrapy .settings import Settings # noqa: TCH002
129 from scrapy .utils .project import get_project_settings
1310 from scrapy .utils .python import to_bytes
14- from scrapy .utils .request import request_from_dict
1511except ImportError as exc :
1612 raise ImportError (
1713 'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".' ,
1814 ) from exc
1915
20- from apify ._crypto import crypto_random_object_id
2116from apify .actor import Actor
2217from apify .storages import RequestQueue , StorageClientManager
2318
@@ -42,119 +37,6 @@ def get_running_event_loop_id() -> int:
4237 return id (asyncio .get_running_loop ())
4338
4439
def to_apify_request(scrapy_request: Request, spider: Spider) -> dict:
    """Convert a Scrapy request to an Apify request.

    Args:
        scrapy_request: The Scrapy request to be converted.
        spider: The Scrapy spider that the request is associated with.

    Raises:
        TypeError: If the scrapy_request is not an instance of the scrapy.Request class.

    Returns:
        The converted Apify request.
    """
    if not isinstance(scrapy_request, Request):
        raise TypeError('scrapy_request must be an instance of the scrapy.Request class')

    call_id = crypto_random_object_id(8)
    Actor.log.debug(f'[{call_id}]: to_apify_request was called (scrapy_request={scrapy_request})...')

    apify_request = {
        'url': scrapy_request.url,
        'method': scrapy_request.method,
    }

    # Carry over the Apify-specific identifiers when the Scrapy request has them in its meta.
    request_id = scrapy_request.meta.get('apify_request_id')
    if request_id:
        apify_request['id'] = request_id

    unique_key = scrapy_request.meta.get('apify_request_unique_key')
    if unique_key:
        apify_request['uniqueKey'] = unique_key

    # Serialize the whole Scrapy Request into the Apify request's userData so it can be
    # reconstructed later: Request -> dict -> pickle -> base64 text.
    # Reference for the approach: https://stackoverflow.com/questions/30469575/.
    serialized_request = codecs.encode(pickle.dumps(scrapy_request.to_dict(spider=spider)), 'base64').decode()
    apify_request['userData'] = {'scrapy_request': serialized_request}

    Actor.log.debug(f'[{call_id}]: scrapy_request was converted to the apify_request={apify_request}')
    return apify_request
88-
def to_scrapy_request(apify_request: dict, spider: Spider) -> Request:
    """Convert an Apify request to a Scrapy request.

    Args:
        apify_request: The Apify request to be converted.
        spider: The Scrapy spider that the request is associated with.

    Raises:
        TypeError: If the apify_request is not a dictionary.
        ValueError: If the apify_request does not contain the required keys.

    Returns:
        The converted Scrapy request.
    """
    if not isinstance(apify_request, dict):
        raise TypeError('apify_request must be a dictionary')

    # Every Apify request must provide these four fields before conversion.
    missing_keys = [key for key in ('url', 'method', 'id', 'uniqueKey') if key not in apify_request]
    if missing_keys:
        raise ValueError(f'apify_request must contain {", ".join(map(repr, missing_keys))} key(s)')

    call_id = crypto_random_object_id(8)
    Actor.log.debug(f'[{call_id}]: to_scrapy_request was called (apify_request={apify_request})...')

    if 'userData' in apify_request and 'scrapy_request' in apify_request['userData']:
        # The Apify request originated from Scrapy: restore the original Request object
        # by reversing the serialization (base64 text -> pickle -> dict -> Request).
        # NOTE(review): pickle.loads runs on data read back from the request queue —
        # assumed to be written only by to_apify_request; do not feed it untrusted input.
        Actor.log.debug(f'[{call_id}]: Restoring the Scrapy Request from the apify_request...')

        encoded_request = apify_request['userData']['scrapy_request']
        if not isinstance(encoded_request, str):
            raise TypeError('scrapy_request_dict_encoded must be a string')

        request_dict = pickle.loads(codecs.decode(encoded_request.encode(), 'base64'))
        if not isinstance(request_dict, dict):
            raise TypeError('scrapy_request_dict must be a dictionary')

        scrapy_request = request_from_dict(request_dict, spider=spider)
        if not isinstance(scrapy_request, Request):
            raise TypeError('scrapy_request must be an instance of the Request class')

        Actor.log.debug(f'[{call_id}]: Scrapy Request successfully reconstructed (scrapy_request={scrapy_request})...')

        # Stamp the Apify identifiers into the restored request's meta.
        combined_meta = scrapy_request.meta or {}
        combined_meta['apify_request_id'] = apify_request['id']
        combined_meta['apify_request_unique_key'] = apify_request['uniqueKey']
        scrapy_request._meta = combined_meta  # scrapy_request.meta is a property, so we have to set it like this
    else:
        # The Apify request came straight from the Request Queue (typically a start URL),
        # so there is nothing to restore — build a fresh Scrapy Request.
        Actor.log.debug(f'[{call_id}]: gonna create a new Scrapy Request (cannot be restored)')

        scrapy_request = Request(
            url=apify_request['url'],
            method=apify_request['method'],
            meta={
                'apify_request_id': apify_request['id'],
                'apify_request_unique_key': apify_request['uniqueKey'],
            },
        )

    Actor.log.debug(f'[{call_id}]: an apify_request was converted to the scrapy_request={scrapy_request}')
    return scrapy_request
157-
15840def apply_apify_settings (* , settings : Settings | None = None , proxy_config : dict | None = None ) -> Settings :
15941 """Integrates Apify configuration into a Scrapy project settings.
16042
0 commit comments