4 changes: 2 additions & 2 deletions cli.py
@@ -315,7 +315,7 @@ def resume_scrape(db, tf):
 
 def verify_mirror(tf, rn):
     for i in range(rn):
-        verifier.main(tf, i)
+        verifier.run_verification(tf, i)
 
 
 def resume_verify_mirror(tf, rn):
@@ -326,7 +326,7 @@ def resume_verify_mirror(tf, rn):
         verifier.resume_verification(tf)
     else:
         for i in range(rn):
-            verifier.main(tf, i)
+            verifier.run_verification(tf, i)
 
 
 def delete_nodes(ptf, ctf):
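The rename from verifier.main() to verifier.run_verification() gives the verification entry point a descriptive name. The verifier module itself is not part of this diff, so the sketch below is a hypothetical stand-in that only borrows the (task_file, run_number) signature seen in the calls above, to show the calling convention cli.py now relies on:

# Hedged sketch: run_verification is a stand-in for the real function in
# verifier.py (not shown in this diff); only its two-argument signature is
# taken from the cli.py calls above.

def run_verification(task_file, run_number):
    # The real implementation re-checks the mirrored pages listed in
    # task_file; run_number identifies which verification pass this is.
    print("verification pass", run_number, "over", task_file)

def verify_mirror(task_file, retry_count):
    # Mirrors cli.verify_mirror: one verification pass per retry index.
    for i in range(retry_count):
        run_verification(task_file, i)

if __name__ == "__main__":
    verify_mirror("task_file.json", 3)  # hypothetical task file name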
17 changes: 9 additions & 8 deletions crawler.py
@@ -523,14 +523,14 @@ def scrape_nodes(self, async=True):
         """
         self.debug_logger.info("Scraping nodes, async = " + str(async))
         if async:
-            self._scrape_pages(self.node_urls)
+            self.scrape_pages(self.node_urls)
         else:
             for elem in self.node_url_tuples:
                 lst = []
                 while len(self.node_urls) > 0 and elem[0] in self.node_urls[0]:
                     lst.append(self.node_urls.pop(0))
                 if len(lst) > 0:
-                    self._scrape_pages(lst)
+                    self.scrape_pages(lst)
         self.debug_logger.info("Finished scraping nodes, async = " + str(async))
 
     def scrape_registrations(self, async=True):
@@ -540,41 +540,42 @@ def scrape_registrations(self, async=True):
         """
         self.debug_logger.info("Scraping registrations, async = " + str(async))
         if async:
-            self._scrape_pages(self.registration_urls)
+            self.scrape_pages(self.registration_urls)
         else:
             for elem in self.registration_url_tuples:
                 lst = []
                 while len(self.registration_urls) > 0 and elem[0] in self.registration_urls:
                     lst.append(self.registration_urls.pop(0))
-                self._scrape_pages(lst)
+                self.scrape_pages(lst)
         self.debug_logger.info("Finished scraping registrations, async = " + str(async))
 
     def scrape_users(self):
         """
         Wrapper method that scrape all urls in self.user_urls. Calls _scrape_pages().
         """
         self.debug_logger.info("Scraping users")
-        self._scrape_pages(self.user_urls)
+        self.scrape_pages(self.user_urls)
         self.debug_logger.info("Finished scraping users")
 
     def scrape_institutions(self):
         """
         Wrapper method that scrape all institution_urls. Calls _scrape_pages().
         """
         self.debug_logger.info("Scraping institutions")
-        self._scrape_pages(self.institution_urls)
+        self.scrape_pages(self.institution_urls)
         self.debug_logger.info("Finished scraping institutions")
 
     def scrape_general(self):
         """
         Wrapper method that scrape all general_urls. Calls _scrape_pages().
         """
         self.debug_logger.info("Scraping general pages")
-        self._scrape_pages(self.general_urls)
+        self.scrape_pages(self.general_urls)
         self.debug_logger.info("Finished scraping general pages")
 
 
     # TODO Make semaphore value a parameter
-    def _scrape_pages(self, aspect_list):
+    def scrape_pages(self, aspect_list):
         """
         Runner method that runs scrape_url()
         :param aspect_list: list of url of pages to scrape
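Beyond renaming _scrape_pages to the public scrape_pages, the synchronous branches batch consecutive URLs that contain the same node id (elem[0]) before handing each batch to the scraper. The following self-contained sketch replays that batching with a print standing in for the real scraper; use_async replaces the async parameter name (a reserved word in Python 3.7+), the example URLs and node ids are made up, and only the first element of each tuple is used, exactly as in the method above:

def scrape_pages(url_batch):
    # Stand-in for Crawler.scrape_pages (the method made public above); the
    # real one fetches every URL in the batch.
    print("scraping batch:", url_batch)

def scrape_nodes(node_urls, node_url_tuples, use_async=True):
    # Same shape as the diffed method: async mode sends everything at once,
    # sync mode batches consecutive URLs containing the same node id.
    if use_async:
        scrape_pages(node_urls)
        return
    for elem in node_url_tuples:
        node_id = elem[0]
        batch = []
        while len(node_urls) > 0 and node_id in node_urls[0]:
            batch.append(node_urls.pop(0))
        if len(batch) > 0:
            scrape_pages(batch)

# Two nodes, two pages each; run synchronously to see the per-node batches.
urls = ["https://staging.osf.io/abc12/", "https://staging.osf.io/abc12/files/",
        "https://staging.osf.io/xyz89/", "https://staging.osf.io/xyz89/wiki/"]
scrape_nodes(urls, [("abc12",), ("xyz89",)], use_async=False)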
99 changes: 0 additions & 99 deletions pages.py

This file was deleted.

2 changes: 1 addition & 1 deletion settings.py
@@ -1,5 +1,5 @@
 # The OSF website URL, and the API
-base_urls = ['https://osf.io/', 'https://api.osf.io/v2/']
+base_urls = ['https://staging.osf.io/', 'https://staging-api.osf.io/v2/']
 
 DEBUG_LOG_FILENAME = 'debug_log.txt'
 ERROR_LOG_FILENAME = 'error_log.txt'
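Switching base_urls points every generated URL at the OSF staging environment instead of production. How the two entries are consumed is not shown in this diff; the sketch below simply assumes, following the comment in settings.py, that index 0 is the website root and index 1 is the APIv2 root, and the helper names are hypothetical:

# Assumed convention from the settings.py comment: [website root, APIv2 root].
base_urls = ['https://staging.osf.io/', 'https://staging-api.osf.io/v2/']

def website_url(path):
    # e.g. website_url('abc12/files/') -> 'https://staging.osf.io/abc12/files/'
    return base_urls[0] + path.lstrip('/')

def api_url(path):
    # e.g. api_url('nodes/abc12/') -> 'https://staging-api.osf.io/v2/nodes/abc12/'
    return base_urls[1] + path.lstrip('/')

print(website_url('abc12/'))
print(api_url('nodes/abc12/'))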