diff --git a/cli.py b/cli.py
index ac45380..34933bd 100644
--- a/cli.py
+++ b/cli.py
@@ -315,7 +315,7 @@ def resume_scrape(db, tf):
 
 def verify_mirror(tf, rn):
     for i in range(rn):
-        verifier.main(tf, i)
+        verifier.run_verification(tf, i)
 
 
 def resume_verify_mirror(tf, rn):
@@ -326,7 +326,7 @@ def resume_verify_mirror(tf, rn):
         verifier.resume_verification(tf)
     else:
         for i in range(rn):
-            verifier.main(tf, i)
+            verifier.run_verification(tf, i)
 
 
 def delete_nodes(ptf, ctf):
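The crawler.py hunks below promote the private `_scrape_pages()` runner to a public `scrape_pages()` entry point, which is what lets the rewritten verifier hand an arbitrary list of failed URLs back to the crawler. A minimal sketch of that call pattern, assuming a `Crawler` that can be constructed without arguments (as `call_rescrape()` in verifier.py does); the URLs are hypothetical:

```python
from crawler import Crawler

# Hypothetical URLs that failed verification; in practice they come from the
# task file's 'error_list'.
failed_urls = [
    'https://staging.osf.io/abc12/',
    'https://staging.osf.io/abc12/files/',
]

crawler = Crawler()
crawler.scrape_pages(failed_urls)  # public runner that replaces _scrape_pages()
```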
""" self.debug_logger.info("Scraping general pages") - self._scrape_pages(self.general_urls) + self.scrape_pages(self.general_urls) self.debug_logger.info("Finished scraping general pages") + # TODO Make semaphore value a parameter - def _scrape_pages(self, aspect_list): + def scrape_pages(self, aspect_list): """ Runner method that runs scrape_url() :param aspect_list: list of url of pages to scrape diff --git a/pages.py b/pages.py deleted file mode 100644 index e5de596..0000000 --- a/pages.py +++ /dev/null @@ -1,99 +0,0 @@ -""" The page superclass and subclasses for verifier""" - -from bs4 import BeautifulSoup -from settings import base_urls -import os - -MIRROR = 'archive/' - - -# Superclass for page-specific page instances -class Page: - def __init__(self, url): - self.url = url - self.path = self.get_path_from_url(url) - # Set size attribute in KB, inherently checks if file exists - try: - self.file_size = os.path.getsize(self.path) / 1000 - except FileNotFoundError: - raise FileNotFoundError - - def __str__(self): - return self.path - - # Takes a URL and produces its relative file name. - def get_path_from_url(self, url): - # Remove http://domain - tail = url.replace(base_urls[0], '') + 'index.html' - path = MIRROR + tail - return path - - def get_content(self): - soup = BeautifulSoup(open(self.path), 'html.parser') - return soup - - -# Page-specific subclasses -class ProjectDashboardPage(Page): - def __init__(self, url): - super().__init__(url) - - -class ProjectFilesPage(Page): - def __init__(self, url): - super().__init__(url) - - -class ProjectWikiPage(Page): - def __init__(self, url): - super().__init__(url) - - -class ProjectAnalyticsPage(Page): - def __init__(self, url): - super().__init__(url) - - -class ProjectRegistrationsPage(Page): - def __init__(self, url): - super().__init__(url) - - -class ProjectForksPage(Page): - def __init__(self, url): - super().__init__(url) - - -class RegistrationDashboardPage(Page): - def __init__(self, url): - super().__init__(url) - - -class RegistrationFilesPage(Page): - def __init__(self, url): - super().__init__(url) - - -class RegistrationWikiPage(Page): - def __init__(self, url): - super().__init__(url) - - -class RegistrationAnalyticsPage(Page): - def __init__(self, url): - super().__init__(url) - - -class RegistrationForksPage(Page): - def __init__(self, url): - super().__init__(url) - - -class UserProfilePage(Page): - def __init__(self, url): - super().__init__(url) - - -class InstitutionDashboardPage(Page): - def __init__(self, url): - super().__init__(url) \ No newline at end of file diff --git a/settings.py b/settings.py index 5106b05..3d66fae 100644 --- a/settings.py +++ b/settings.py @@ -1,5 +1,5 @@ # The OSF website URL, and the API -base_urls = ['https://osf.io/', 'https://api.osf.io/v2/'] +base_urls = ['https://staging.osf.io/', 'https://staging-api.osf.io/v2/'] DEBUG_LOG_FILENAME = 'debug_log.txt' ERROR_LOG_FILENAME = 'error_log.txt' \ No newline at end of file diff --git a/verifier.py b/verifier.py index 8a7f350..871383f 100644 --- a/verifier.py +++ b/verifier.py @@ -1,508 +1,303 @@ import json import codecs -from pages import ProjectDashboardPage, ProjectFilesPage, ProjectAnalyticsPage, \ - ProjectForksPage, ProjectRegistrationsPage, ProjectWikiPage, RegistrationDashboardPage, RegistrationFilesPage, \ - RegistrationAnalyticsPage, RegistrationForksPage, RegistrationWikiPage, UserProfilePage, InstitutionDashboardPage from crawler import Crawler -import bs4 +from bs4 import BeautifulSoup +from settings import base_urls +import 
diff --git a/verifier.py b/verifier.py
index 8a7f350..871383f 100644
--- a/verifier.py
+++ b/verifier.py
@@ -1,508 +1,303 @@
 import json
 import codecs
-from pages import ProjectDashboardPage, ProjectFilesPage, ProjectAnalyticsPage, \
-    ProjectForksPage, ProjectRegistrationsPage, ProjectWikiPage, RegistrationDashboardPage, RegistrationFilesPage, \
-    RegistrationAnalyticsPage, RegistrationForksPage, RegistrationWikiPage, UserProfilePage, InstitutionDashboardPage
 from crawler import Crawler
-import bs4
+from bs4 import BeautifulSoup
+from settings import base_urls
+import os
 
+MIRROR = 'archive/'
 
-# Verifier superclass
-class Verifier:
-    def __init__(self, min_size, pg_type, end):
+
+# Superclass for page-specific page instances
+class Page:
+    """
+    A Page class is designed to hold an instance of a page scraped.
+    Its attributes are:
+    url = the url of the page
+    path = the file path of the page
+    """
+
+    def __init__(self, url):
+        """
+        Constructor for the Page class
+
+        :param url: The url of the page
         """
-        :param min_size: File size minimum for a page. Anything below this couldn't possibly be a complete file.
-        :param pg_type: The class to instantiate page objects with.
-        :param end: Indentifier in the URL, e.g. 'files/', 'end' is a misnomer ('wiki/' in the middle of a URL)
+        self.url = url
+        self.path = self.get_path_from_url(url)
+        # Set size attribute in KB, inherently checks if file exists
+        try:
+            self.file_size = os.path.getsize(self.path) / 1000
+        except FileNotFoundError:
+            raise FileNotFoundError
+
+    def __str__(self):
+        return self.path
+
+    # Takes a URL and produces its relative file name.
+    def get_path_from_url(self, url):
         """
-        self.minimum_size = min_size
-        self.page_type = pg_type
-        self.url_end = end
+        Specifies the file path the page scraped is meant to have.
 
-        # Certain elements will be absent if there's no content for them to display, so we check if there is a loading
-        # bar in its place. This means the element should exist, but it doesn't.
-        self.loading_elements = {}
+        :param url: The url of the page
+        """
+        # Remove http://domain
+        tail = url.replace(base_urls[0], '') + 'index.html'
+        path = MIRROR + tail
+        return path
+
+    def get_content(self):
+        """
+        Returns the content of the page scraped.
+        """
+        soup = BeautifulSoup(open(self.path), 'html.parser')
+        return soup
 
-        # Other elements will be replaced by a message if there's no content for them (e.g. "This user has no projects")
-        # We check for the elements and their alternates if the original isn't found.
-        self.alternate_elements = {}
 
+# Verifier superclass
+class Verifier:
+    """
+    A Verifier class for verification of the OSF Mirror.
+    A CLI is designed to work with this verifier in order to ensure that everything that is scraped is verified.
+    Basic Workflow:
+    1. Init
+    2. All urls from json file run through harvest_pages. Failed pages get sent to rescrape.
+    3. Remaining urls run through size_comparison. Failed pages get sent to rescrape.
+    4. Rescrape failed urls.
+    5. Verify the pages that were just rescraped.
+    """
+
+    def __init__(self):
+        """
+        Constructor for the Verifier class
+
+        min_size: File size minimum for a page. Anything below this couldn't possibly be a complete file.
+        pages: All the page objects
+        failed_pages: Pages that failed verification and are being sent to rescrape.
+        """
+        self.minimum_size = 8
         self.pages = []  # All the page objects
         self.failed_pages = []
 
     # Populate self.pages with the relevant files
-    def harvest_pages(self, json_dictionary, json_list):
+    def harvest_pages(self, json_dictionary, json_list, first_run):
         """
+        On the first run of verification, puts all urls in error_list directly into failed_pages.
+        Otherwise, tries to create page objects; if a scraped file cannot be found, the url is added
+        to failed_pages.
+
+        :param json_dictionary: The dictionary created from the json file
         :param json_list: The list in the json file of found URLs
-        :return: Null, but self.pages is populated.
+        :param first_run: True if this is the first run of verification. False, otherwise.
         """
-        for url in json_list[:]:
-            if self.url_end in url:
-                print('rel: ', url)
-                if url in json_dictionary['error_list']:
+        if json_dictionary['error_list'] is not None:
+            for url in json_list[:]:
+                # print('rel: ', url)
+                if url in json_dictionary['error_list'] and first_run:
                     self.failed_pages.append(url)
-                    print('error: ', url)
+                    # print('error: ', url)
                 else:
                     try:
-                        obj = self.page_type(url)
+                        obj = Page(url)
                         self.pages.append(obj)
                     except FileNotFoundError:
+                        # print("Failed harvest_pages ", url)
                         self.failed_pages.append(url)
                         json_list.remove(url)
 
     # Compare page size to page-specific minimum that any fully-scraped page should have
     def size_comparison(self):
+        """
+        Checks the file size of every page instance against the minimum size specified in the constructor.
+        Pages that fail get added to failed_pages to be sent to rescrape.
+        """
         for page in self.pages[:]:
-            # print(page)
-            # print(page.file_size)
             if not page.file_size > self.minimum_size:
-                print('Failed: size_comparison(): ', page, ' has size: ', page.file_size)
+                # print('Failed: size_comparison(): ', page, ' has size: ', page.file_size)
                 self.failed_pages.append(page.url)
                 self.pages.remove(page)
         return
 
-    # Check that specified elements are supposed to exist and a loading bar isn't present instead
-    # Check that specified elements or their alternates are present and non-empty in each page
-    # Alternate: different elements appear if there isn't supposed to be content, so it has to check both
-    # Format: Filled-in : Alternate
-    def spot_check(self):
-        for page in self.pages[:]:
-            soup = page.get_content()
-            # Existential crisis:
-            for element in self.loading_elements:
-                final_element = self.loading_elements[element]  # What is supposed to be there
-                loading_bar_result = soup.select(element)  # Is a loading bar present?
-                if len(loading_bar_result) > 0:  # A loading bar exists (so content does not exist completely)
-                    print("Failed: existential spot_check() ", page, final_element, " doesn't exist, loader ", element, " present.")
-                    self.failed_pages.append(page.url)
-                    self.pages.remove(page)
-                    break
-                else:
-                    # Alternate checker:
-                    for element in self.alternate_elements:
-                        alt = self.alternate_elements[element]
-                        result = soup.select(element)
-                        # No results or empty results, with alternate
-                        if (len(result) == 0 or len(result[0].contents) == 0) and alt != '':
-                            alt_result = soup.select(alt)
-
-                            # Element's alternate has no or empty results
-                            if len(alt_result) == 0 or len(alt_result[0].contents) == 0:
-                                print("Failed: alternate spot_check(): ", page, alt, '\n')
-                                self.failed_pages.append(page.url)
-                                self.pages.remove(page)
-                                break
-
-                        # Element has no alternate and no results or empty results
-                        elif (len(result) == 0 or len(result[0].contents) == 0) and alt == '':
-                            print('Failed: spot_check(): ', page, element, "No alt.", '\n')
-                            self.failed_pages.append(page.url)
-                            self.pages.remove(page)
-                            break
-        return
+    def run_verifier(self, json_dictionary, json_list, first_run):
+        """
+        Runs the verifier.
 
-    def run_verifier(self, json_filename, json_list):
-        self.harvest_pages(json_filename, json_list)
+        :param json_dictionary: The dictionary created from the json file
+        :param json_list: The list in the json file of found URLs
+        :param first_run: True if this is the first run of verification. False, otherwise.
+        """
+        self.harvest_pages(json_dictionary, json_list, first_run)
         self.size_comparison()
-        # self.spot_check()
-
-
-# Verifier subclasses
-
-class ProjectDashboardVerifier(Verifier):
-    def __init__(self):
-        super().__init__(410, ProjectDashboardPage, '')
-        self.loading_elements = {
-            "#treeGrid > div > p": '#tb-tbody',  # Files list
-            "#containment": "#render-node",  # Exists if there are supposed to be components / Is it filled?
-        }
-        self.alternate_elements = {
-            '#nodeTitleEditable': '',  # Title
-            '#contributors span.date.node-last-modified-date': '',  # Last modified
-            '#contributorsList > ol': '',  # Contributor list
-            '#tb-tbody': '',  # File list
-            '#logScope > div > div > div.panel-body > span > dl': '#logFeed > div > p'
-            # Activity / "Unable to retrieve at this time"
-        }
-
-    # Override: the loader for loading_elements is still supposed to exist
-    # Check that specified elements are supposed to exist and a loading bar isn't present instead
-    # Check that specified elements or their alternates are present and non-empty in each page
-    # Alternate: different elements appear if there isn't supposed to be content, so it has to check both
-    # Format: Filled-in : Alternate
-    def spot_check(self):
-        for page in self.pages[:]:
-            soup = page.get_content()
-            # Existential crisis:
-            for element in self.loading_elements:
-                final_element = self.loading_elements[element]  # What is supposed to be there
-                loading_bar_result = soup.select(element)
-                if len(loading_bar_result) > 0:  # Container div is present
-                    final_result = soup.select(final_element)
-                    if len(final_result) == 0:  # Final element isn't in place
-                        print("Failed: existential spot_check() ", page, final_element, " doesn't exist, loader ", element,
-                              " present.")
-                        self.failed_pages.append(page.url)
-                        self.pages.remove(page)
-                        break
-                else:
-                    # Alternate checker:
-                    for element in self.alternate_elements:
-                        alt = self.alternate_elements[element]
-                        result = soup.select(element)
-                        # No results or empty results, with alternate
-                        if (len(result) == 0 or len(result[0].contents) == 0) and alt != '':
-                            alt_result = soup.select(alt)
-
-                            # Element's alternate has no or empty results
-                            if len(alt_result) == 0 or len(alt_result[0].contents) == 0:
-                                print("Failed: alternate spot_check(): ", page, alt, '\n')
-                                self.failed_pages.append(page.url)
-                                self.pages.remove(page)
-                                break
-
-                        # Element has no alternate and no results or empty results
-                        elif (len(result) == 0 or len(result[0].contents) == 0) and alt == '':
-                            print('Failed: spot_check(): ', page, element, "No alt.", '\n')
-                            self.failed_pages.append(page.url)
-                            self.pages.remove(page)
-                            break
-        return
-
-
-class ProjectFilesVerifier(Verifier):
-    def __init__(self):
-        super().__init__(380, ProjectFilesPage, "files/")
-        self.alternate_elements = {
-            '.fg-file-links': '',  # Links to files (names them)
-        }
-
-
-class ProjectWikiVerifier(Verifier):
-    def __init__(self):
-        super().__init__(410, ProjectWikiPage, "wiki/")
-        self.alternate_elements = {
-            '#wikiViewRender': '#wikiViewRender > p > em',  # Wiki content / `No wiki content`
-            '#viewVersionSelect option': '',  # Current version date modified
-            '.fg-file-links': ''  # Links to other pages (names them)
-        }
-
-
-class ProjectAnalyticsVerifier(Verifier):
-    def __init__(self):
-        super().__init__(380, ProjectAnalyticsPage, "analytics/")
-        self.alternate_elements = {
-            '#adBlock': 'div.watermarked > div > div.m-b-md.p-md.osf-box-lt.box-round.text-center',
-            # Warning about AdBlock
-            'iframe': 'div.watermarked > div > div.m-b-md.p-md.osf-box-lt.box-round.text-center',
-            # External frame for analytics
-        }
-
-
-class ProjectRegistrationsVerifier(Verifier):
-    def __init__(self):
-        super().__init__(380, ProjectRegistrationsPage, "registrations/")
-        self.alternate_elements = {
-            '#renderNode': '#registrations > div > div > p'  # List of nodes
-        }
-
-
-class ProjectForksVerifier(Verifier):
-    def __init__(self):
-        super().__init__(380, ProjectForksPage, "forks/")
-        self.alternate_elements = {
-            '#renderNode': 'div.watermarked > div > div.row > div.col-xs-9.col-sm-8 > p'  # List
-        }
-
-
-class RegistrationDashboardVerifier(Verifier):
-    def __init__(self):
-        super().__init__(410, RegistrationDashboardPage, "")
-        self.loading_elements = {
-            "#treeGrid > div > p": '#tb-tbody',  # Files list
-            "#containment": "#render-node",  # Exists if there are supposed to be components / Is it filled?
-        }
-        self.alternate_elements = {
-            '#nodeTitleEditable': '',  # Title
-            '#contributors > div > p:nth-of-type(5) > span': '',  # Last modified
-            '#contributorsList > ol': '',  # Contributor list
-            '#logScope > div > div > div.panel-body > span > dl': '#logFeed > div > p'
-            # Activity / "Unable to retrieve at this time"
-        }
-
-    # Override: the loader for loading_elements is still supposed to exist
-    # Check that specified elements are supposed to exist and a loading bar isn't present instead
-    # Check that specified elements or their alternates are present and non-empty in each page
-    # Alternate: different elements appear if there isn't supposed to be content, so it has to check both
-    # Format: Filled-in : Alternate
-    def spot_check(self):
-        for page in self.pages[:]:
-            soup = page.get_content()
-            # Existential crisis:
-            for element in self.loading_elements:
-                final_element = self.loading_elements[element]  # What is supposed to be there
-                loading_bar_result = soup.select(element)
-                if len(loading_bar_result) > 0:  # Container div is present
-                    final_result = soup.select(final_element)
-                    if len(final_result) == 0:  # Final element isn't in place
-                        print("Failed: existential spot_check() ", page, final_element, " doesn't exist, loader ", element,
-                              " present.")
-                        self.failed_pages.append(page.url)
-                        self.pages.remove(page)
-                        break
-                else:
-                    # Alternate checker:
-                    for element in self.alternate_elements:
-                        alt = self.alternate_elements[element]
-                        result = soup.select(element)
-                        # No results or empty results, with alternate
-                        if (len(result) == 0 or len(result[0].contents) == 0) and alt != '':
-                            alt_result = soup.select(alt)
-
-                            # Element's alternate has no or empty results
-                            if len(alt_result) == 0 or len(alt_result[0].contents) == 0:
-                                print("Failed: alternate spot_check(): ", page, alt, '\n')
-                                self.failed_pages.append(page.url)
-                                self.pages.remove(page)
-                                break
-
-                        # Element has no alternate and no results or empty results
-                        elif (len(result) == 0 or len(result[0].contents) == 0) and alt == '':
-                            print('Failed: spot_check(): ', page, element, "No alt.", '\n')
-                            self.failed_pages.append(page.url)
-                            self.pages.remove(page)
-                            break
-        return
-
-
-class RegistrationFilesVerifier(Verifier):
-    def __init__(self):
-        super().__init__(380, RegistrationFilesPage, "files/")
-        self.alternate_elements = {
-            '.fg-file-links': '',  # Links to files (names them)
-        }
-
-
-class RegistrationWikiVerifier(Verifier):
-    def __init__(self):
-        super().__init__(410, RegistrationWikiPage, "wiki/")
-        self.alternate_elements = {
-            '#wikiViewRender': '#wikiViewRender > p > em',  # Wiki content / `No wiki content`
-            '#viewVersionSelect option': '',  # Current version date modified
-            '.fg-file-links': ''  # Links to other pages (names them)
-        }
-
-
-class RegistrationAnalyticsVerifier(Verifier):
-    def __init__(self):
-        super().__init__(380, RegistrationAnalyticsPage, "analytics/")
-        self.alternate_elements = {
-            '#adBlock': 'div.watermarked > div > div.m-b-md.p-md.osf-box-lt.box-round.text-center',
-            # Warning about AdBlock
-            'iframe': 'div.watermarked > div > div.m-b-md.p-md.osf-box-lt.box-round.text-center',
-            # External frame for analytics
-        }
-
-
-class RegistrationForksVerifier(Verifier):
-    def __init__(self):
-        super().__init__(380, RegistrationForksPage, "forks/")
-        self.alternate_elements = {
-            '#renderNode': 'div.watermarked > div > div.row > div.col-xs-9.col-sm-8 > p'  # List
-        }
-
-
-class UserProfileVerifier(Verifier):
-    def __init__(self):
-        super().__init__(80, UserProfilePage, "")
-        self.alternate_elements = {
-            '#projects': 'div > div:nth-of-type(1) > div > div.panel-body > div',  # Project list / "No projects"
-            '#components': 'div > div:nth-of-type(2) > div > div.panel-body > div',  # Component list / "No components"
-            'body h2': ''  # Activity points, project count
-        }
-
-
-class InstitutionDashboardVerifier(Verifier):
-    def __init__(self):
-        super().__init__(350, InstitutionDashboardPage, "")
-        self.loading_elements = {
-            '#fileBrowser > div.db-main > div.line-loader > div.load-message': '.fg-file-links'  # "loading" / Project browser
-        }
-        self.alternate_elements = {
-            '#fileBrowser > div.db-infobar > div > div': '#fileBrowser > div.db-infobar > div > div'  # Project preview / "Select a project"
-        }
 
 
+def verify_nodes(verification_dictionary, list_name, first_run):
+    """
+    Called when scrape_nodes = True
+    :param verification_dictionary: The dictionary created from the json file.
+    :param list_name: The list in the json file of found URLs.
+    :param first_run: True if this is the first run of verification. False, otherwise.
+    :return: nodes_list_failed_verification: List of all the node urls that need to be rescraped.
+    """
+    projects_verifier = Verifier()
+    projects_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run)
+    nodes_list_failed_verification = projects_verifier.failed_pages
 
-# Called when json file had scrape_nodes = true
-# Checks for all the components of a project and if they were scraped
-# Verifies them and returns a list of the failed pages
-def verify_nodes(verification_dictionary, list_name):
-    nodes_list_verified = []
-    if verification_dictionary['include_files']:
-        project_files_verifier = ProjectFilesVerifier()
-        project_files_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name])
-        project_files = project_files_verifier.failed_pages
-        nodes_list_verified += project_files
-    if verification_dictionary['include_wiki']:
-        project_wiki_verifier = ProjectWikiVerifier()
-        project_wiki_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name])
-        project_wiki = project_wiki_verifier.failed_pages
-        nodes_list_verified += project_wiki
-    if verification_dictionary['include_analytics']:
-        project_analytics_verifier = ProjectAnalyticsVerifier()
-        project_analytics_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name])
-        project_analytics = project_analytics_verifier.failed_pages
-        nodes_list_verified += project_analytics
-    if verification_dictionary['include_registrations']:
-        project_registrations_verifier = ProjectRegistrationsVerifier()
-        project_registrations_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name])
-        project_registrations = project_registrations_verifier.failed_pages
-        nodes_list_verified += project_registrations
-    if verification_dictionary['include_forks']:
-        project_forks_verifier = ProjectForksVerifier()
-        project_forks_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name])
-        project_forks = project_forks_verifier.failed_pages
-        nodes_list_verified += project_forks
-    if verification_dictionary['include_dashboard']:  # This must go last because its URLs don't have a specific ending.
-        project_dashboards_verifier = ProjectDashboardVerifier()
-        project_dashboards_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name])
-        project_dashboards = project_dashboards_verifier.failed_pages
-        nodes_list_verified += project_dashboards
-    return nodes_list_verified
+    return nodes_list_failed_verification
 
 
 # Called when json file had scrape_registrations = true
-# Verifies the components of a registration and returns a list of the failed pages
-def verify_registrations(verification_dictionary, list_name):
+def verify_registrations(verification_dictionary, list_name, first_run):
+    """
+    Called when scrape_registrations = True
+
+    :param verification_dictionary: The dictionary created from the json file.
+    :param list_name: The list in the json file of found URLs.
+    :param first_run: True if this is the first run of verification. False, otherwise.
+    :return: registrations_list_failed_verification: List of all the registration urls that need to be rescraped.
+    """
     # Must run all page types automatically
-    registration_files_verifier = RegistrationFilesVerifier()
-    registration_files_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name])
-    registration_files = registration_files_verifier.failed_pages
-
-    registration_wiki_verifier = RegistrationWikiVerifier()
-    registration_wiki_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name])
-    registration_wiki = registration_wiki_verifier.failed_pages
-
-    registration_analytics_verifier = RegistrationAnalyticsVerifier()
-    registration_analytics_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name])
-    registration_analytics = registration_analytics_verifier.failed_pages
-
-    registration_forks_verifier = RegistrationForksVerifier()
-    registration_forks_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name])
-    registration_forks = registration_forks_verifier.failed_pages
-
-    registration_dashboards_verifier = RegistrationDashboardVerifier()
-    registration_dashboards_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name])
-    registration_dashboards = registration_dashboards_verifier.failed_pages
+    registrations_verifier = Verifier()
+    registrations_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run)
+    registrations_list_failed_verification = registrations_verifier.failed_pages
 
-    registrations_list_verified = registration_files + registration_wiki + registration_analytics + \
-                                  registration_forks + registration_dashboards
-    return registrations_list_verified
+    return registrations_list_failed_verification
 
 
 # Called when json file had scrape_users = true
 # Verifies all user profile pages and returns a list of the failed pages
-def verify_users(verification_dictionary, list_name):
-    user_profiles_verifier = UserProfileVerifier()
-    user_profiles_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name])
-    user_profiles = user_profiles_verifier.failed_pages
-    return user_profiles
+def verify_users(verification_dictionary, list_name, first_run):
+    """
+    Called when scrape_users = True
+
+    :param verification_dictionary: The dictionary created from the json file.
+    :param list_name: The list in the json file of found URLs.
+    :param first_run: True if this is the first run of verification. False, otherwise.
+    :return: user_profiles_failed_verification: List of all the user urls that need to be rescraped.
+    """
+    user_profiles_verifier = Verifier()
+    user_profiles_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run)
+    user_profiles_failed_verification = user_profiles_verifier.failed_pages
+    return user_profiles_failed_verification
 
 
 # Called when json file had scrape_institutions = true
 # Verifies all user profile pages and returns a list of the failed pages
-def verify_institutions(verification_dictionary, list_name):
-    institution_dashboards_verifier = InstitutionDashboardVerifier()
-    institution_dashboards_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name])
-    institution_dashboards = institution_dashboards_verifier.failed_pages
-    return institution_dashboards
-
-
-def call_rescrape(json_dictionary, verification_json_dictionary):
+def verify_institutions(verification_dictionary, list_name, first_run):
+    """
+    Called when scrape_institutions = True
+
+    :param verification_dictionary: The dictionary created from the json file.
+    :param list_name: The list in the json file of found URLs.
+    :param first_run: True if this is the first run of verification. False, otherwise.
+    :return: institutions_dashboards_failed_verification: List of all the institution urls that need to be rescraped.
+    """
+    institution_dashboards_verifier = Verifier()
+    institution_dashboards_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run)
+    institution_dashboards_failed_verification = institution_dashboards_verifier.failed_pages
+    return institution_dashboards_failed_verification
+
+
+def call_rescrape(verification_dictionary):
+    """
+    Rescrapes all urls that failed verification.
+    Creates an instance of the crawler and calls scrape_pages on all urls dumped into 'error_list' in the json file.
+
+    :param verification_dictionary: The dictionary created from the json file.
+    """
     print("Called rescrape.")
     second_chance = Crawler()
-    if json_dictionary['scrape_nodes']:
-        second_chance.node_urls = verification_json_dictionary['node_urls_failed_verification']
-        second_chance.scrape_nodes(async=True)
-    if json_dictionary['scrape_registrations']:
-        second_chance.registration_urls = verification_json_dictionary['registration_urls_failed_verification']
-        second_chance.scrape_registrations(async=True)
-    if json_dictionary['scrape_users']:
-        second_chance.user_urls = verification_json_dictionary['user_urls_failed_verification']
-        second_chance.scrape_users()
-    if json_dictionary['scrape_institutions']:
-        second_chance.institution_urls = verification_json_dictionary['institution_urls_failed_verification']
-        second_chance.scrape_institutions()
+    second_chance.scrape_pages(verification_dictionary['error_list'])
+
 
+def setup_verification(json_dictionary, first_run):
+    """
+    Specifies which lists in the json task file need to be read from, based on conditions specified in the json
+    task file. Also, if it's after the first run of verification, all urls to be verified are read from error_list.
 
-def setup_verification(json_dictionary, verification_json_dictionary, first_scrape):
+    :param json_dictionary: The dictionary created from the json file.
+    :param first_run: True if this is the first run of verification. False, otherwise.
+    :return: failed_verification_list: List of all the urls that need to be rescraped.
+
+    """
+    failed_verification_list = []
     print("Check verification")
     if json_dictionary['scrape_nodes']:
-        if first_scrape:
+        if first_run:
            list_name = 'node_urls'
        else:
-            list_name = 'node_urls_failed_verification'
-        verification_json_dictionary['node_urls_failed_verification'] = verify_nodes(json_dictionary, list_name)
+            list_name = 'error_list'
+        failed_verification_list += verify_nodes(json_dictionary, list_name, first_run)
     if json_dictionary['scrape_registrations']:
-        if first_scrape:
+        if first_run:
            list_name = 'registration_urls'
        else:
-            list_name = 'registration_urls_failed_verification'
-        verification_json_dictionary['registration_urls_failed_verification'] = verify_registrations(json_dictionary,
-                                                                                                     list_name)
+            list_name = 'error_list'
+        failed_verification_list += verify_registrations(json_dictionary, list_name, first_run)
     if json_dictionary['scrape_users']:
-        if first_scrape:
+        if first_run:
            list_name = 'user_urls'
        else:
-            list_name = 'user_urls_failed_verification'
-        verification_json_dictionary['user_urls_failed_verification'] = \
-            verify_users(json_dictionary, list_name)
+            list_name = 'error_list'
+        failed_verification_list += verify_users(json_dictionary, list_name, first_run)
     if json_dictionary['scrape_institutions']:
-        if first_scrape:
+        if first_run:
            list_name = 'institution_urls'
        else:
-            list_name = 'institution_urls_failed_verification'
-        verification_json_dictionary['institution_urls_failed_verification'] = verify_institutions(json_dictionary,
-                                                                                                   list_name)
-
-
-def run_verification(json_file, i):
+            list_name = 'error_list'
+        failed_verification_list += verify_institutions(json_dictionary, list_name, first_run)
+
+    return failed_verification_list
+
+
+def run_verification(json_file, retry_number):
+    """
+    CLI Endpoint for a normal run of verification.
+    Controls the main workflow of verification.
+    Two copies of the json task file are opened. One to preserve the original lists of urls to be verified,
+    and one to alter to dump all urls to be rescraped into.
+    On the first run of verification, certain conditions in the json file are checked to determine what lists
+    in the json file to read from based on what was scraped. An additional condition is added to the json file
+    when the first run of verification is finished to specify that all subsequent runs of verification need only
+    read from and dump to the list 'error_list'.
+
+    :param json_file: Name of the json task file.
+    :param retry_number: Which iteration of verification is being run.
+    """
     with codecs.open(json_file, mode='r', encoding='utf-8') as failure_file:
         run_info = json.load(failure_file)
 
     with codecs.open(json_file, mode='r', encoding='utf-8') as failure_file:
         run_copy = json.load(failure_file)
 
-    if i == 0:
+    if retry_number == 0:
         print("Begun 1st run")
         if run_info['scrape_finished']:
-            setup_verification(run_info, run_copy, True)
+            run_copy['error_list'] = setup_verification(run_info, True)
             run_copy['1st_verification_finished'] = True
             with codecs.open(json_file, mode='w', encoding='utf-8') as file:
                 json.dump(run_copy, file, indent=4)
             print("Dumped json run_copy 1st verify")
-            call_rescrape(run_info, run_copy)
+            call_rescrape(run_copy)
     else:
         print("Begun next run")
-        setup_verification(run_copy, run_copy, False)
+        run_copy['error_list'] = setup_verification(run_copy, False)
         # truncates json and dumps new lists
         with codecs.open(json_file, mode='w', encoding='utf-8') as file:
             json.dump(run_copy, file, indent=4)
-        call_rescrape(run_copy, run_copy)
+        call_rescrape(run_copy)
 
 
-def resume_verification(json_filename):
-    with codecs.open(json_filename, mode='r', encoding='utf-8') as failure_file:
-        run_copy = json.load(failure_file)
-    print("Resumed verification.")
-    setup_verification(run_copy, run_info, False)
-    # truncates json and dumps new lists
-    with codecs.open(json_filename, mode='w', encoding='utf-8') as file:
-        json.dump(run_copy, file, indent=4)
-    call_rescrape(run_copy, run_copy)
-
+def resume_verification(json_file):
+    """
+    CLI Endpoint for resuming interrupted verification.
 
-def main(json_filename, num_retries):
-    # For testing:
-    # num_retries = 2
-    # call two verification/scraping methods depending on num retries
-    run_verification(json_filename, num_retries)
+    :param json_file: Name of the json task file.
+    """
+    with codecs.open(json_file, mode='r', encoding='utf-8') as failure_file:
+        run_copy = json.load(failure_file)
+    print("Resumed verification.")
+    run_copy['error_list'] = setup_verification(run_copy, False)
+    # truncates json and dumps new lists
+    with codecs.open(json_file, mode='w', encoding='utf-8') as file:
+        json.dump(run_copy, file, indent=4)
+    call_rescrape(run_copy)