2 changes: 1 addition & 1 deletion .gitignore
@@ -1,8 +1,8 @@
# General business
.idea/
__pycache__/
.env/
.venv/
.env/

# Mirror folders
localhost:70/
7 changes: 7 additions & 0 deletions README.md
@@ -62,6 +62,7 @@ Navigate into the new folder ( `cd ROSIEBot` )
- Resuming: continuing the crawl/scrape process if it stops in the middle
- Verifying: making sure all the files are present and in acceptable condition
- Compiling active: getting a list from the API about existing pages
- Shrinking: reducing the size of the mirror by removing redundant CSS


## Using the Command Line Interface
@@ -131,6 +132,12 @@ Remove anything inside a category folder that isn't listed on the API. Requires

`python cli.py --delete --ctf=<TASKFILE>`

#### `--shrink`

Remove redundant CSS from the files and replace it with a link to a consolidated file in the static folder. Requires the `--tf=` flag.
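
For example, with the task file produced by a scrape:

`python cli.py --shrink --tf=<TASKFILE>`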

To create the consolidated CSS file, run `python -m scripts.update_css` first. This step is required before shrinking.

#### `--index`

Creates a search engine index.
51 changes: 35 additions & 16 deletions cli.py
@@ -5,6 +5,8 @@
import codecs
import verifier
import deleter
import shrinker
import os

# Endpoint for using the ROSIEBot module via command line.

@@ -19,6 +21,7 @@
"import task file")
@click.option('--delete', is_flag=True, help="Delete nodes from the mirror that have been deleted by users. Requires "
"compile_active-produced active-node taskfile")
@click.option('--shrink', is_flag=True, help="Reduce the mirror size by eliminating redundant CSS. Requires a task file")
# Specify parameters for other needed values
@click.option('--dm', default=None, type=click.STRING, help="Date marker needed for normal scrape")
@click.option('--tf', default=None, type=click.STRING, help="filename of the task file")
@@ -37,12 +40,14 @@
@click.option('-a', is_flag=True, help="Add this flag if you want to include analytics page for nodes")
@click.option('-r', is_flag=True, help="Add this flag if you want to include registrations page for nodes")
@click.option('-k', is_flag=True, help="Add this flag if you want to include forks page for nodes")
def cli_entry_point(scrape, resume, verify, resume_verify, compile_active, delete, dm, tf, rn, ctf, registrations,
users, institutions, nodes, d, f, w, a, r, k):

def cli_entry_point(scrape, resume, verify, resume_verify, compile_active, delete, shrink, dm, tf, rn, ptf, ctf, registrations, users,
institutions, nodes, d, f, w, a, r, k):
# Check to see if more than one option is chosen.
if sum(map(bool, [scrape, resume, verify, resume_verify, compile_active, delete, shrink])) != 1:
click.echo("Invalid options. Please select only one mode.")

if (resume or verify or resume_verify or shrink) and tf is None:
click.echo("This mode requires a task file in the form: --tf=<FILENAME>")
return

if compile_active:
@@ -69,10 +74,7 @@ def cli_entry_point(scrape, resume, verify, resume_verify, compile_active, delet
click.echo("Use `python cli.py --verify --tf={}` to fix any missing or incomplete pages".format(filename))
return

if resume and tf is None:
click.echo("Need a task file to resume a scrape")

if resume and tf is not None:
if resume:
click.echo('Resuming scrape with the task file : ' + tf)
try:
with codecs.open(tf, 'r', encoding='utf-8') as db:
Expand All @@ -81,20 +83,14 @@ def cli_entry_point(scrape, resume, verify, resume_verify, compile_active, delet
click.echo('File Not Found for the task.')
return

if verify and tf is None:
click.echo("Need a task file to verify a mirror")

if verify and tf is not None:
if verify:
try:
verify_mirror(tf, rn)
except FileNotFoundError:
click.echo('File Not Found for the task.')
return

if resume_verify and tf is None:
click.echo("Need a task file to resume verification")

if resume_verify and tf is not None:
if resume_verify:
try:
resume_verify_mirror(tf, rn)
except FileNotFoundError:
@@ -105,7 +101,13 @@ def cli_entry_point(scrape, resume, verify, resume_verify, compile_active, delet
try:
delete_nodes(ctf)
except FileNotFoundError:
click.echo("The json file of currently active nodes was not found.")
click.echo("Either the json file of previously active nodes was not found or the json file of currently "
"active nodes was not found.")
return

if shrink:
shrink_size(tf)
return

return

@@ -370,5 +372,22 @@ def delete_nodes(ctf):
macc = deleter.Deleter(ctf)
macc.run()


def get_folder_size(folder):
folder_size = 0
for (path, dirs, files) in os.walk(folder):
for file in files:
filename = os.path.join(path, file)
folder_size += os.path.getsize(filename)
return "%0.1f MB" % (folder_size / (1024 * 1024.0))


def shrink_size(tf):
click.echo("Beginning size: " + get_folder_size("archive"))
freud = shrinker.Shrinker(tf)
freud.run()
click.echo("New size: " + get_folder_size("archive"))
return

if __name__ == '__main__':
cli_entry_point()
94 changes: 94 additions & 0 deletions scripts/update_css.py
@@ -0,0 +1,94 @@
"""
Currently, Prerender injects thousands of lines of CSS into each file. The shrink utility replaces this
with a link to a single consolidated file containing all of the CSS that was once injected inline.

This script regenerates that consolidated CSS file with whatever CSS current OSF pages inject.
"""
from bs4 import BeautifulSoup
import requests

# Reading from HTTP request vs. from a file
REMOTE = True

# URLs to use if REMOTE. Any example of each type of page will do, really
page_urls = {
Collaborator: I guess we do not have to update the CSS through scraping the OSF. If we want to shrink, we can get the CSS from the pages we want to shrink. This seems unnecessary.

'https://osf.io/2ngdw/files': 'Project Files',
'https://osf.io/2ngdw/wiki': 'Project Wiki',
'https://osf.io/2ngdw/analytics': 'Project Analytics',
'https://osf.io/2ngdw/forks': 'Project Forks',
'https://osf.io/2ngdw/registrations': 'Project Registrations',
'https://osf.io/c7vbx': 'User',
'https://osf.io/institutions/cos': 'Institution'
}

# Local filename if not REMOTE
FILENAME = ''
# Remove style element from the local file after scraping
SHRINK_FILE = True

# Output CSS file
CSS_FILEPATH = '../archive/static/consolidated.css'


# Unique <style> blocks collected from the scraped pages
giblets = []
# Output handle for the consolidated CSS file
css = open(CSS_FILEPATH, 'w')


# Remove style tags and put the content in the consolidated file
def scrape_css(html, css_file):

soup = BeautifulSoup(html, 'html.parser')

for elem in soup.findAll('style'):
giblet = elem.text
if giblet not in giblets:
giblets.append(elem.text)
elem.extract()

link_tag = soup.new_tag("link", href=CSS_FILEPATH, rel="stylesheet")
if soup.head is not None:
soup.head.insert(0, link_tag)

return str(soup)


# Use a website as a base to copy CSS
def stream_html(url):
header = {
"User-agent": "LinkedInBot/1.0 (compatible; Mozilla/5.0; Jakarta Commons-HttpClient/3.1 +http://www.linkedin.com)"}

html = requests.Session().get(url, headers=header).text
scrape_css(html, css)


# Use a file to copy CSS and remove it from the file
def open_html(path):
file = open(path, 'r+')
new_html = scrape_css(file, css)

if SHRINK_FILE:
file.seek(0)
Collaborator: Same here. Seek(0), truncate() and then write() seems to be better.

file.write(new_html)
file.truncate()

file.close()


def main():

if REMOTE:
# Go through every page type
for page in page_urls:
print("Extracting", page_urls[page], page)
stream_html(page)

else:
open_html(FILENAME)

for block in giblets:
css.write(block + '\n')

css.close()

if __name__ == '__main__':
main()
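
As a rough sketch of the collaborator's suggestion above (collecting the CSS from pages already in the mirror instead of scraping live OSF pages), something like the following could work. It assumes the mirrored HTML sits under `archive/` and that the consolidated file belongs at `archive/static/consolidated.css`, matching the paths used elsewhere in this PR; it is an illustration, not part of the change.

import os
from bs4 import BeautifulSoup

ARCHIVE_ROOT = 'archive'                        # assumed mirror root
OUTPUT_CSS = 'archive/static/consolidated.css'  # assumed output path for the consolidated file


def collect_local_css(root=ARCHIVE_ROOT, out_path=OUTPUT_CSS):
    # Walk the mirror, gather each unique <style> block, and write them all to one CSS file.
    blocks = []
    for dirpath, _, filenames in os.walk(root):
        for name in filenames:
            if not name.endswith('.html'):
                continue
            with open(os.path.join(dirpath, name), 'r', encoding='utf-8') as page:
                soup = BeautifulSoup(page, 'html.parser')
            for style in soup.find_all('style'):
                if style.text not in blocks:
                    blocks.append(style.text)
    with open(out_path, 'w', encoding='utf-8') as css_file:
        css_file.write('\n'.join(blocks))


if __name__ == '__main__':
    collect_local_css()
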
56 changes: 56 additions & 0 deletions shrinker.py
@@ -0,0 +1,56 @@
import json, codecs
from bs4 import BeautifulSoup
import tqdm


class Shrinker:
"""
Shrinker visits every file in the JSON taskfile, removes any <style> tags,
and inserts a single link to /static/consolidated.css.

update_css.py generates this consolidated file.
"""
def __init__(self, tf):
with codecs.open(tf, mode='r', encoding='utf-8') as file:
self.run_info = json.load(file)

# Go through each section of the taskfile, generate file paths, and replace the CSS in each file
def run(self):
if self.run_info['scrape_nodes']:
print("Downsizing nodes")
self.nodes = ['archive/' + '/'.join(url.split('/')[3:]) + 'index.html' for url in self.run_info['node_urls']]
for node in tqdm.tqdm(self.nodes):
self._replace_css(node)
if self.run_info['scrape_registrations']:
print("Downsizing registrations")
self.registrations = ['archive/' + '/'.join(url.split('/')[3:]) + 'index.html' for url in self.run_info['registration_urls']]
for registration in tqdm.tqdm(self.registrations):
self._replace_css(registration)
if self.run_info['scrape_users']:
print("Downsizing users")
self.users = ['archive/' + '/'.join(url.split('/')[3:]) + 'index.html' for url in self.run_info['user_urls']]
for user in tqdm.tqdm(self.users):
self._replace_css(user)
if self.run_info['scrape_institutions']:
print("Downsizing institutions")
self.institutions = ['archive/' + '/'.join(url.split('/')[3:]) + 'index.html' for url in self.run_info['institution_urls']]
for institution in tqdm.tqdm(self.institutions):
self._replace_css(institution)

# Create a BS4 object of a file, remove any <style> results, and insert <link rel=stylesheet>
def _replace_css(self, path):
try:
file = open(path, "r+")
soup = BeautifulSoup(file, 'html.parser')
for elem in soup.findAll('style'):
elem.extract()
link_tag = soup.new_tag("link", href="/static/consolidated.css", rel="stylesheet")
if soup.head is not None:
soup.head.insert(0, link_tag)
# soup = soup.prettify()
file.seek(0)
Collaborator: Is this trying to erase the file and write again? If so, seek(0), truncate() and then write() would make more sense.

file.write(str(soup))
file.truncate()
file.close()
except FileNotFoundError:
return
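
On the review comments about the file rewrite: the PR's seek(0), write(), truncate() order and the suggested seek(0), truncate(), write() order both leave only the new HTML in the file. A minimal standalone sketch of the suggested order (a hypothetical helper, not part of the PR):

def rewrite_in_place(path, new_html):
    # Replace a file's contents in place, truncating before writing as the reviewers suggest.
    with open(path, 'r+', encoding='utf-8') as f:
        f.seek(0)          # move back to the start of the file
        f.truncate()       # discard the old contents
        f.write(new_html)  # write the replacement HTML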