2 changes: 1 addition & 1 deletion .gitignore
@@ -1,8 +1,8 @@
# General business
.idea/
__pycache__/
.env/
.venv/
.env/

# Mirror folders
localhost:70/
7 changes: 7 additions & 0 deletions README.md
@@ -62,6 +62,7 @@ Navigate into the new folder ( `cd ROSIEBot` )
- Resuming: continuing the crawl/scrape process if it stops in the middle
- Verifying: making sure all the files are present and in acceptable condition
- Compiling active: getting a list from the API about existing pages
- Shrinking: reducing the size of the mirror by removing redundant CSS


## Using the Command Line Interface
@@ -131,6 +132,12 @@ Remove anything inside a category folder that isn't listed on the API. Requires

`python cli.py --delete --ctf=<TASKFILE>`

#### `--shrink`

Remove redundant CSS from the files and replace it with a link to a consolidated file in the static folder. Requires the `--tf=` flag.
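
For example, with the task file produced by a scrape:

`python cli.py --shrink --tf=<TASKFILE>`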

To create the consolidated CSS file, run `python -m scripts.update_css` first. This step is required before shrinking.

#### `--index`

Creates a search engine index.
51 changes: 35 additions & 16 deletions cli.py
@@ -5,6 +5,8 @@
import codecs
import verifier
import deleter
import shrinker
import os

# Endpoint for using the ROSIEBot module via command line.

@@ -19,6 +21,7 @@
"import task file")
@click.option('--delete', is_flag=True, help="Delete nodes from the mirror that have been deleted by users. Requires "
"compile_active-produced active-node taskfile")
@click.option('--shrink', is_flag=True, help="Reduce the mirror size by eliminating redundant CSS. Requires a task file")
# Specify parameters for other needed values
@click.option('--dm', default=None, type=click.STRING, help="Date marker needed for normal scrape")
@click.option('--tf', default=None, type=click.STRING, help="filename of the task file")
@@ -37,12 +40,14 @@
@click.option('-a', is_flag=True, help="Add this flag if you want to include analytics page for nodes")
@click.option('-r', is_flag=True, help="Add this flag if you want to include registrations page for nodes")
@click.option('-k', is_flag=True, help="Add this flag if you want to include forks page for nodes")
def cli_entry_point(scrape, resume, verify, resume_verify, compile_active, delete, dm, tf, rn, ctf, registrations,
users, institutions, nodes, d, f, w, a, r, k):

def cli_entry_point(scrape, resume, verify, resume_verify, compile_active, delete, shrink, dm, tf, rn, ptf, ctf, registrations, users,
institutions, nodes, d, f, w, a, r, k):
# Check to see if more than one option is chosen.
if sum(map(bool, [scrape, resume, verify, resume_verify, compile_active, delete, shrink])) != 1:
click.echo("Invalid options. Please select only one mode.")

if (resume or verify or resume_verify or shrink) and tf is None:
click.echo("This mode requires a task file in the form: --tf=<FILENAME>")
return

if compile_active:
@@ -69,10 +74,7 @@ def cli_entry_point(scrape, resume, verify, resume_verify, compile_active, delet
click.echo("Use `python cli.py --verify --tf={}` to fix any missing or incomplete pages".format(filename))
return

if resume and tf is None:
click.echo("Need a task file to resume a scrape")

if resume and tf is not None:
if resume:
click.echo('Resuming scrape with the task file : ' + tf)
try:
with codecs.open(tf, 'r', encoding='utf-8') as db:
Expand All @@ -81,20 +83,14 @@ def cli_entry_point(scrape, resume, verify, resume_verify, compile_active, delet
click.echo('File Not Found for the task.')
return

if verify and tf is None:
click.echo("Need a task file to verify a mirror")

if verify and tf is not None:
if verify:
try:
verify_mirror(tf, rn)
except FileNotFoundError:
click.echo('File Not Found for the task.')
return

if resume_verify and tf is None:
click.echo("Need a task file to resume verification")

if resume_verify and tf is not None:
if resume_verify:
try:
resume_verify_mirror(tf, rn)
except FileNotFoundError:
@@ -105,7 +101,13 @@ def cli_entry_point(scrape, resume, verify, resume_verify, compile_active, delet
try:
delete_nodes(ctf)
except FileNotFoundError:
click.echo("The json file of currently active nodes was not found.")
click.echo("Either the json file of previously active nodes was not found or the json file of currently "
"active nodes was not found.")
return

if shrink:
shrink_size(tf)
return

return

@@ -370,5 +372,22 @@ def delete_nodes(ctf):
macc = deleter.Deleter(ctf)
macc.run()


def get_folder_size(folder):
folder_size = 0
for (path, dirs, files) in os.walk(folder):
for file in files:
filename = os.path.join(path, file)
folder_size += os.path.getsize(filename)
return "%0.1f MB" % (folder_size / (1024 * 1024.0))


def shrink_size(tf):
click.echo("Beginning size: " + get_folder_size("archive"))
freud = shrinker.Shrinker(tf)
freud.run()
click.echo("New size: " + get_folder_size("archive"))
return

if __name__ == '__main__':
cli_entry_point()
94 changes: 94 additions & 0 deletions scripts/update_css.py
@@ -0,0 +1,94 @@
"""
Currently, Prerender injects thousands of lines of CSS into each file. The shrink utility replaces this
with a link to a single consolidated file containing all of the CSS that was once injected inline.

This script regenerates that consolidated CSS file with whatever CSS current OSF pages inject.
"""
from bs4 import BeautifulSoup
import requests

# Reading from HTTP request vs. from a file
REMOTE = True

# URLs to use if REMOTE. Any example of each type of page will do, really
page_urls = {
Collaborator: I guess we do not have to update the CSS through scraping the OSF. If we want to shrink, we can get the CSS from the pages we want to shrink. This seems unnecessary.

'https://osf.io/2ngdw/files': 'Project Files',
'https://osf.io/2ngdw/wiki': 'Project Wiki',
'https://osf.io/2ngdw/analytics': 'Project Analytics',
'https://osf.io/2ngdw/forks': 'Project Forks',
'https://osf.io/2ngdw/registrations': 'Project Registrations',
'https://osf.io/c7vbx': 'User',
'https://osf.io/institutions/cos': 'Institution'
}

# Local filename if not REMOTE
FILENAME = ''
# Remove style element from the local file after scraping
SHRINK_FILE = True

# Output CSS file
CSS_FILEPATH = '../archive/static/consolidated.css'


# Unique <style> blocks collected from the scraped pages
giblets = []
# Output handle for the consolidated CSS file
css = open(CSS_FILEPATH, 'w')


# Remove style tags and put the content in the consolidated file
def scrape_css(html, css_file):

soup = BeautifulSoup(html, 'html.parser')

for elem in soup.findAll('style'):
giblet = elem.text
if giblet not in giblets:
giblets.append(elem.text)
elem.extract()

link_tag = soup.new_tag("link", href=CSS_FILEPATH, rel="stylesheet")
if soup.head is not None:
soup.head.insert(0, link_tag)

return str(soup)


# Use a website as a base to copy CSS
def stream_html(url):
header = {
"User-agent": "LinkedInBot/1.0 (compatible; Mozilla/5.0; Jakarta Commons-HttpClient/3.1 +http://www.linkedin.com)"}

html = requests.Session().get(url, headers=header).text
scrape_css(html, css)


# Use a file to copy CSS and remove it from the file
def open_html(path):
file = open(path, 'r+')
new_html = scrape_css(file, css)

if SHRINK_FILE:
file.seek(0)
Collaborator: Same here. Seek(0), truncate() and then write() seems to be better.

file.write(new_html)
file.truncate()

file.close()


def main():

if REMOTE:
# Go through every page type
for page in page_urls:
print("Extracting", page_urls[page], page)
stream_html(page)

else:
open_html(FILENAME)

for block in giblets:
css.write(block + '\n')

css.close()

if __name__ == '__main__':
main()
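
As a rough sketch of the collaborator's suggestion above (collecting the CSS from pages already in the mirror instead of scraping live OSF pages), something like the following could work. It assumes the mirrored HTML sits under `archive/` and that the consolidated file belongs at `archive/static/consolidated.css`, matching the paths used elsewhere in this PR; it is an illustration, not part of the change.

import os
from bs4 import BeautifulSoup

ARCHIVE_ROOT = 'archive'                        # assumed mirror root
OUTPUT_CSS = 'archive/static/consolidated.css'  # assumed output path for the consolidated file


def collect_local_css(root=ARCHIVE_ROOT, out_path=OUTPUT_CSS):
    # Walk the mirror, gather each unique <style> block, and write them all to one CSS file.
    blocks = []
    for dirpath, _, filenames in os.walk(root):
        for name in filenames:
            if not name.endswith('.html'):
                continue
            with open(os.path.join(dirpath, name), 'r', encoding='utf-8') as page:
                soup = BeautifulSoup(page, 'html.parser')
            for style in soup.find_all('style'):
                if style.text not in blocks:
                    blocks.append(style.text)
    with open(out_path, 'w', encoding='utf-8') as css_file:
        css_file.write('\n'.join(blocks))


if __name__ == '__main__':
    collect_local_css()
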
56 changes: 56 additions & 0 deletions shrinker.py
@@ -0,0 +1,56 @@
import json, codecs
from bs4 import BeautifulSoup
import tqdm


class Shrinker:
"""
Shrinker visits every file in the JSON taskfile, removes any <style> tags,
and inserts a single link to /static/consolidated.css.

update_css.py generates this consolidated file.
"""
def __init__(self, tf):
with codecs.open(tf, mode='r', encoding='utf-8') as file:
self.run_info = json.load(file)

# Go through each section of the taskfile, generate file paths, and replace the CSS in each file
def run(self):
if self.run_info['scrape_nodes']:
print("Downsizing nodes")
self.nodes = ['archive/' + '/'.join(url.split('/')[3:]) + 'index.html' for url in self.run_info['node_urls']]
for node in tqdm.tqdm(self.nodes):
self._replace_css(node)
if self.run_info['scrape_registrations']:
print("Downsizing registrations")
self.registrations = ['archive/' + '/'.join(url.split('/')[3:]) + 'index.html' for url in self.run_info['registration_urls']]
for registration in tqdm.tqdm(self.registrations):
self._replace_css(registration)
if self.run_info['scrape_users']:
print("Downsizing users")
self.users = ['archive/' + '/'.join(url.split('/')[3:]) + 'index.html' for url in self.run_info['user_urls']]
for user in tqdm.tqdm(self.users):
self._replace_css(user)
if self.run_info['scrape_institutions']:
print("Downsizing institutions")
self.institutions = ['archive/' + '/'.join(url.split('/')[3:]) + 'index.html' for url in self.run_info['institution_urls']]
for institution in tqdm.tqdm(self.institutions):
self._replace_css(institution)

# Create a BS4 object of a file, remove any <style> results, and insert <link rel=stylesheet>
def _replace_css(self, path):
try:
file = open(path, "r+")
soup = BeautifulSoup(file, 'html.parser')
for elem in soup.findAll('style'):
elem.extract()
link_tag = soup.new_tag("link", href="/static/consolidated.css", rel="stylesheet")
if soup.head is not None:
soup.head.insert(0, link_tag)
# soup = soup.prettify()
file.seek(0)
Collaborator: Is this trying to erase the file and write again? If so, seek(0), truncate() and then write() would make more sense.

file.write(str(soup))
file.truncate()
file.close()
except FileNotFoundError:
return
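
On the review comments about the file rewrite: the PR's seek(0), write(), truncate() order and the suggested seek(0), truncate(), write() order both leave only the new HTML in the file. A minimal standalone sketch of the suggested order (a hypothetical helper, not part of the PR):

def rewrite_in_place(path, new_html):
    # Replace a file's contents in place, truncating before writing as the reviewers suggest.
    with open(path, 'r+', encoding='utf-8') as f:
        f.seek(0)          # move back to the start of the file
        f.truncate()       # discard the old contents
        f.write(new_html)  # write the replacement HTML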