Feature/shrink #85
base: develop
Changes from all commits:
7e8ed8f
d36791f
54a33a3
adfd500
6395e42
40bae81
8a3ed3b
60f8865
d279f93
d50fa48
872d914
c0bef3c
@@ -1,8 +1,8 @@
```
# General business
.idea/
__pycache__/
.env/
.venv/
.env/

# Mirror folders
localhost:70/
```
@@ -0,0 +1,94 @@
```python
"""
Currently, Prerender injects thousands of lines of CSS into each file. The shrink utility replaces this
with a link to one CSS file containing everything that was once injected inline.

This script renews that consolidated CSS file with whatever tomorrow's OSF pages have injected into them.
"""
from bs4 import BeautifulSoup
import requests


# Read from an HTTP request vs. from a local file
REMOTE = True

# URLs to use if REMOTE. Any example of each page type will do, really.
page_urls = {
    'https://osf.io/2ngdw/files': 'Project Files',
    'https://osf.io/2ngdw/wiki': 'Project Wiki',
    'https://osf.io/2ngdw/analytics': 'Project Analytics',
    'https://osf.io/2ngdw/forks': 'Project Forks',
    'https://osf.io/2ngdw/registrations': 'Project Registrations',
    'https://osf.io/c7vbx': 'User',
    'https://osf.io/institutions/cos': 'Institution'
}

# Local filename if not REMOTE
FILENAME = ''
# Remove the <style> elements from the local file after scraping
SHRINK_FILE = True

# Output CSS file
CSS_FILEPATH = '../archive/static/consolidated.css'


# Collected <style> contents, deduplicated in order of discovery
giblets = []
css = open(CSS_FILEPATH, 'w')


# Remove <style> tags and collect their content for the consolidated file
def scrape_css(html, css_file):
    soup = BeautifulSoup(html, 'html.parser')

    for elem in soup.findAll('style'):
        giblet = elem.text
        if giblet not in giblets:
            giblets.append(elem.text)
        elem.extract()

    link_tag = soup.new_tag("link", href=CSS_FILEPATH, rel="stylesheet")
    if soup.head is not None:
        soup.head.insert(0, link_tag)

    return str(soup)


# Use a live page as the source to copy CSS from
def stream_html(url):
    header = {
        "User-agent": "LinkedInBot/1.0 (compatible; Mozilla/5.0; Jakarta Commons-HttpClient/3.1 +http://www.linkedin.com)"}

    html = requests.Session().get(url, headers=header).text
    scrape_css(html, css)


# Use a local file to copy CSS from, and optionally remove it from that file
def open_html(path):
    file = open(path, 'r+')
    new_html = scrape_css(file, css)

    if SHRINK_FILE:
        file.seek(0)
```
Collaborator: Same here. `seek(0)`, `truncate()`, and then `write()` seems better.
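For reference, a minimal sketch of the ordering the reviewer suggests (a hypothetical helper, not part of this PR):

```python
# Hypothetical helper illustrating the suggested order: seek, truncate, then write.
def rewrite_in_place(path, new_text):
    with open(path, 'r+') as f:
        f.seek(0)          # rewind to the start of the file
        f.truncate()       # discard the old contents before writing
        f.write(new_text)  # nothing stale can survive the rewrite
```

The PR's write()-then-truncate() order is equivalent in practice, since truncate() with no argument cuts the file at the current position, which sits at the end of the new content after the write; the suggestion is mainly about readability.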
```python
        file.write(new_html)
        file.truncate()

    file.close()


def main():
    if REMOTE:
        # Go through every page type
        for page in page_urls:
            print("Extracting", page_urls[page], page)
            stream_html(page)
    else:
        open_html(FILENAME)

    for block in giblets:
        css.write(block + '\n')

    css.close()


if __name__ == '__main__':
    main()
```
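As a rough illustration of what scrape_css() does to a page (a made-up snippet, not an actual OSF page), the inline <style> block is removed and a single stylesheet link is inserted at the top of <head>:

```python
# Hypothetical round trip through scrape_css() on a tiny page.
sample = "<html><head><style>body { color: red; }</style></head><body>hi</body></html>"
print(scrape_css(sample, css))
# Roughly:
# <html><head><link href="../archive/static/consolidated.css" rel="stylesheet"/></head><body>hi</body></html>
```

Note that the css_file parameter of scrape_css() is currently unused; the collected blocks are written out from main() via the module-level giblets list and css handle.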
@@ -0,0 +1,56 @@
```python
import json, codecs
from bs4 import BeautifulSoup
import tqdm


class Shrinker:
    """
    Shrinker visits every file in the JSON taskfile, removes any <style> tag,
    and inserts one link to static/css/consolidated.css.

    update_css.py generates this consolidated file.
    """
    def __init__(self, tf):
        with codecs.open(tf, mode='r', encoding='utf-8') as file:
            self.run_info = json.load(file)

    # Go through each section of the taskfile, generate file paths, and replace the CSS in each file
    def run(self):
        if self.run_info['scrape_nodes']:
            print("Downsizing nodes")
            self.nodes = ['archive/' + '/'.join(url.split('/')[3:]) + 'index.html' for url in self.run_info['node_urls']]
            for node in tqdm.tqdm(self.nodes):
                self._replace_css(node)
        if self.run_info['scrape_registrations']:
            print("Downsizing registrations")
            self.registrations = ['archive/' + '/'.join(url.split('/')[3:]) + 'index.html' for url in self.run_info['registration_urls']]
            for registration in tqdm.tqdm(self.registrations):
                self._replace_css(registration)
        if self.run_info['scrape_users']:
            print("Downsizing users")
            self.users = ['archive/' + '/'.join(url.split('/')[3:]) + 'index.html' for url in self.run_info['user_urls']]
            for user in tqdm.tqdm(self.users):
                self._replace_css(user)
        if self.run_info['scrape_institutions']:
            print("Downsizing institutions")
            self.institutions = ['archive/' + '/'.join(url.split('/')[3:]) + 'index.html' for url in self.run_info['institution_urls']]
            for institution in tqdm.tqdm(self.institutions):
                self._replace_css(institution)

    # Create a BS4 object of a file, remove any <style> results, and insert <link rel="stylesheet">
    def _replace_css(self, path):
        try:
            file = open(path, "r+")
            soup = BeautifulSoup(file, 'html.parser')
            for elem in soup.findAll('style'):
                elem.extract()
            link_tag = soup.new_tag("link", href="/static/consolidated.css", rel="stylesheet")
            if soup.head is not None:
                soup.head.insert(0, link_tag)
            # soup = soup.prettify()
            file.seek(0)
```
Collaborator: Is this trying to erase the file and write again? If so, `seek(0)`, `truncate()`, and then `write()` would make more sense.
```python
            file.write(str(soup))
            file.truncate()
            file.close()
        except FileNotFoundError:
            return
```
I guess we do not have to update the CSS through scraping the OSF. If we want to shrink, we can get the CSS from the pages we want to shrink. This seems unnecessary.
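One way to act on that suggestion (a hypothetical sketch, not code from this PR, assuming the taskfile already lists the local files): collect the <style> contents while shrinking and write consolidated.css from them, so no request to osf.io is needed.

```python
# Hypothetical variant: build consolidated.css from the files being shrunk,
# instead of fetching fresh pages from osf.io.
from bs4 import BeautifulSoup

def shrink_and_collect(paths, css_path='archive/static/consolidated.css'):
    blocks = []
    for path in paths:
        with open(path, 'r+') as f:
            soup = BeautifulSoup(f, 'html.parser')
            for elem in soup.findAll('style'):
                if elem.text not in blocks:  # dedupe, as update_css.py does
                    blocks.append(elem.text)
                elem.extract()
            if soup.head is not None:
                soup.head.insert(0, soup.new_tag("link", href="/static/consolidated.css", rel="stylesheet"))
            f.seek(0)
            f.truncate()
            f.write(str(soup))
    with open(css_path, 'w') as css_file:
        css_file.write('\n'.join(blocks))
```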