diff --git a/.gitignore b/.gitignore
index fe9f4e2..7b952ef 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,8 +1,8 @@
 # General business
 .idea/
 __pycache__/
-.env/
 .venv/
+.env/
 
 # Mirror folders
 localhost:70/
diff --git a/README.md b/README.md
index a8469a1..86ab8b7 100644
--- a/README.md
+++ b/README.md
@@ -62,6 +62,7 @@ Navigate into the new folder ( `cd ROSIEBot` )
 - Resuming: continuing the crawl/scrape process if it stops in the middle
 - Verifying: making sure all the files are present and in acceptable condition
 - Compiling active: getting a list from the API about existing pages
+- Shrinking: reducing the size of the mirror by removing redundant CSS
 
 ## Using the Command Line Interface
 
@@ -131,6 +132,12 @@ Remove anything inside a category folder that isn't listed on the API. Requires
 
 `python cli.py --delete --ctf=`
 
+#### `--shrink`
+
+Remove redundant CSS from the files and replace it with a link to a consolidated file in the static folder. Requires the `--tf=` flag.
+
+To create the consolidated CSS file, run `python -m scripts.update_css` first. This step is required before shrinking.
+
 #### `--index`
 
 Creates a search engine index.
diff --git a/cli.py b/cli.py
index ec05513..fe5de2a 100644
--- a/cli.py
+++ b/cli.py
@@ -5,6 +5,8 @@
 import codecs
 import verifier
 import deleter
+import shrinker
+import os
 
 
 # Endpoint for using the ROSIEBot module via command line.
@@ -19,6 +21,7 @@
                    "import task file")
 @click.option('--delete', is_flag=True, help="Delete nodes from the mirror that have been deleted by users. Requires "
                                              "compile_active-produced active-node taskfile")
+@click.option('--shrink', is_flag=True, help="Reduce the mirror size by eliminating redundant CSS. Requires a task file")
 # Specify parameters for other needed values
 @click.option('--dm', default=None, type=click.STRING, help="Date marker needed for normal scrape")
 @click.option('--tf', default=None, type=click.STRING, help="filename of the task file")
@@ -37,12 +40,15 @@
 @click.option('-a', is_flag=True, help="Add this flag if you want to include analytics page for nodes")
 @click.option('-r', is_flag=True, help="Add this flag if you want to include registrations page for nodes")
 @click.option('-k', is_flag=True, help="Add this flag if you want to include forks page for nodes")
-def cli_entry_point(scrape, resume, verify, resume_verify, compile_active, delete, dm, tf, rn, ctf, registrations,
-                    users, institutions, nodes, d, f, w, a, r, k):
-
+def cli_entry_point(scrape, resume, verify, resume_verify, compile_active, delete, shrink, dm, tf, rn, ctf,
+                    registrations, users, institutions, nodes, d, f, w, a, r, k):
     # Check to see if more than one option is chosen.
-    if sum(map(bool, [scrape, resume, verify, resume_verify, compile_active, delete])) != 1:
+    if sum(map(bool, [scrape, resume, verify, resume_verify, compile_active, delete, shrink])) != 1:
         click.echo("Invalid options. \nPlease select only one mode.")
+        return
+
+    if (resume or verify or resume_verify or shrink) and tf is None:
+        click.echo("This mode requires a task file in the form: --tf=")
         return
 
     if compile_active:
@@ -69,10 +75,7 @@ def cli_entry_point(scrape, resume, verify, resume_verify, compile_active, delet
         click.echo("Use `python cli.py --verify --tf={}` to fix any missing or incomplete pages".format(filename))
         return
 
-    if resume and tf is None:
-        click.echo("Need a task file to resume a scrape")
-
-    if resume and tf is not None:
+    if resume:
         click.echo('Resuming scrape with the task file : ' + tf)
         try:
             with codecs.open(tf, 'r', encoding='utf-8') as db:
@@ -81,20 +84,14 @@ def cli_entry_point(scrape, resume, verify, resume_verify, compile_active, delet
             click.echo('File Not Found for the task.')
             return
 
-    if verify and tf is None:
-        click.echo("Need a task file to verify a mirror")
-
-    if verify and tf is not None:
+    if verify:
         try:
             verify_mirror(tf, rn)
         except FileNotFoundError:
             click.echo('File Not Found for the task.')
             return
 
-    if resume_verify and tf is None:
-        click.echo("Need a task file to resume verification")
-
-    if resume_verify and tf is not None:
+    if resume_verify:
         try:
             resume_verify_mirror(tf, rn)
         except FileNotFoundError:
@@ -105,7 +102,13 @@ def cli_entry_point(scrape, resume, verify, resume_verify, compile_active, delet
         try:
             delete_nodes(ctf)
         except FileNotFoundError:
-            click.echo("The json file of currently active nodes was not found.")
+            click.echo("Either the json file of previously active nodes was not found or the json file of currently "
+                       "active nodes was not found.")
+        return
+
+    if shrink:
+        shrink_size(tf)
+        return
 
     return
 
@@ -370,5 +373,24 @@ def delete_nodes(ctf):
     macc = deleter.Deleter(ctf)
     macc.run()
 
+
+def get_folder_size(folder):
+    # Walk the folder tree and total the file sizes, reported in MB.
+    folder_size = 0
+    for (path, dirs, files) in os.walk(folder):
+        for file in files:
+            filename = os.path.join(path, file)
+            folder_size += os.path.getsize(filename)
+    return "%0.1f MB" % (folder_size / (1024 * 1024.0))
+
+
+def shrink_size(tf):
+    # Report the mirror size before and after shrinking.
+    click.echo("Beginning size: " + get_folder_size("archive"))
+    freud = shrinker.Shrinker(tf)
+    freud.run()
+    click.echo("New size: " + get_folder_size("archive"))
+    return
+
 if __name__ == '__main__':
     cli_entry_point()
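The exclusivity guard at the top of `cli_entry_point` counts the selected modes by coercing each flag through `bool`, so exactly one chosen mode yields a sum of 1. A minimal sketch of the idea (the `flags` list here is illustrative, not part of the diff):

    # Each click flag is truthy or falsy; bool() maps it to 1 or 0,
    # so the sum equals the number of modes the user selected.
    flags = [False, True, False, False, False, False, False]  # only --resume set
    assert sum(map(bool, flags)) == 1   # exactly one mode: accepted
    flags[0] = True                     # --scrape passed as well
    assert sum(map(bool, flags)) != 1   # two modes: rejected with an error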
diff --git a/scripts/update_css.py b/scripts/update_css.py
new file mode 100644
index 0000000..ef4dd1a
--- /dev/null
+++ b/scripts/update_css.py
@@ -0,0 +1,95 @@
+"""
+Currently, Prerender injects thousands of lines of CSS into each page it renders. The shrink
+utility replaces that inline CSS with a single link to a consolidated stylesheet.
+
+This script regenerates the consolidated CSS file from whatever styles current OSF pages inject.
+"""
+from bs4 import BeautifulSoup
+import requests
+
+# Read pages over HTTP (True) or from a local file (False)
+REMOTE = True
+
+# URLs to use if REMOTE. Any example of each type of page will do.
+page_urls = {
+    'https://osf.io/2ngdw/files': 'Project Files',
+    'https://osf.io/2ngdw/wiki': 'Project Wiki',
+    'https://osf.io/2ngdw/analytics': 'Project Analytics',
+    'https://osf.io/2ngdw/forks': 'Project Forks',
+    'https://osf.io/2ngdw/registrations': 'Project Registrations',
+    'https://osf.io/c7vbx': 'User',
+    'https://osf.io/institutions/cos': 'Institution'
+}
+
+# Local filename if not REMOTE
+FILENAME = ''
+# Remove the style elements from the local file after scraping
+SHRINK_FILE = True
+
+# Output CSS file
+CSS_FILEPATH = '../archive/static/consolidated.css'
+
+# Unique <style> blocks collected from every page
+giblets = []
+css = open(CSS_FILEPATH, 'w')
+
+
+# Remove style tags, collecting their content for the consolidated file
+def scrape_css(html):
+
+    soup = BeautifulSoup(html, 'html.parser')
+
+    for elem in soup.find_all('style'):
+        giblet = elem.text
+        if giblet not in giblets:
+            giblets.append(giblet)
+        elem.extract()
+
+    link_tag = soup.new_tag("link", href=CSS_FILEPATH, rel="stylesheet")
+    if soup.head is not None:
+        soup.head.insert(0, link_tag)
+
+    return str(soup)
+
+
+# Use a website as a base to copy CSS
+def stream_html(url):
+    # Crawler user agent so Prerender serves fully rendered HTML
+    header = {
+        "User-agent": "LinkedInBot/1.0 (compatible; Mozilla/5.0; Jakarta Commons-HttpClient/3.1 +http://www.linkedin.com)"}
+
+    html = requests.Session().get(url, headers=header).text
+    scrape_css(html)
+
+
+# Use a file to copy CSS and remove it from the file
+def open_html(path):
+    file = open(path, 'r+')
+    new_html = scrape_css(file)
+
+    if SHRINK_FILE:
+        file.seek(0)
+        file.write(new_html)
+        file.truncate()
+
+    file.close()
+
+
+def main():
+
+    if REMOTE:
+        # Go through every page type
+        for page in page_urls:
+            print("Extracting", page_urls[page], page)
+            stream_html(page)
+
+    else:
+        open_html(FILENAME)
+
+    for block in giblets:
+        css.write(block + '\n')
+
+    css.close()
+
+if __name__ == '__main__':
+    main()
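The transformation `scrape_css` performs is essentially what the new `shrinker.Shrinker` (below) applies to every mirrored file: strip the injected `<style>` blocks and point the page at the consolidated stylesheet instead. A minimal sketch of that per-page step, assuming a hypothetical `shrink_page` helper and stylesheet path that are not part of this diff:

    from bs4 import BeautifulSoup

    CSS_HREF = 'static/consolidated.css'  # assumed mirror-relative path

    def shrink_page(html):
        """Drop injected <style> blocks and link the shared stylesheet instead."""
        soup = BeautifulSoup(html, 'html.parser')
        for style in soup.find_all('style'):
            style.extract()  # remove the inline CSS block entirely
        link = soup.new_tag('link', href=CSS_HREF, rel='stylesheet')
        if soup.head is not None:
            soup.head.insert(0, link)  # one <link> stands in for thousands of lines
        return str(soup)

    page = '<html><head><style>body{margin:0}</style></head><body>hi</body></html>'
    print(shrink_page(page))  # the <style> block is gone; a <link> appears in <head>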
diff --git a/shrinker.py b/shrinker.py
new file mode 100644
index 0000000..388f18f
--- /dev/null
+++ b/shrinker.py
@@ -0,0 +1,56 @@
+import json, codecs
+from bs4 import BeautifulSoup
+import tqdm
+
+
+class Shrinker:
+    """
+    Shrinker visits every file in the JSON taskfile and removes any