From 18cd18ae16995cd08e3fd1f7064fd63288dd590e Mon Sep 17 00:00:00 2001 From: mhibbert Date: Sat, 11 Jan 2020 13:58:13 +0000 Subject: [PATCH 1/5] added support for and test --- .gitignore | 2 ++ htmlark.py | 1 + tests/test_htmlark.py | 20 +++++++++++++++++++- 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index d7f76f9..bf5fb7a 100644 --- a/.gitignore +++ b/.gitignore @@ -58,3 +58,5 @@ target/ # Pyenv name .python-version + +.idea/ diff --git a/htmlark.py b/htmlark.py index 02c2918..147f39c 100755 --- a/htmlark.py +++ b/htmlark.py @@ -186,6 +186,7 @@ def convert_page(page_path: str, parser: str='auto', # Gather all the relevant tags together if not ignore_images: tags += soup('img') + tags += soup('image') if not ignore_css: csstags = soup('link') for css in csstags: diff --git a/tests/test_htmlark.py b/tests/test_htmlark.py index 8a1e0fd..db66119 100644 --- a/tests/test_htmlark.py +++ b/tests/test_htmlark.py @@ -3,7 +3,7 @@ import importlib import os.path import unittest - +import bs4 import htmlark # Check for existance of requests @@ -51,3 +51,21 @@ def test_get_resource_errors(self): htmlark.requests_get = None with self.assertRaises(NameError): htmlark._get_resource("http://example.com/not/a/real/path.png") + + def test_get_image_element_and_create_dataa_uri(self): + packed_html = htmlark.convert_page('https://www.bbc.co.uk/news/world-africa-51063149', ignore_errors=True) + + parser = htmlark.get_available_parsers()[0] + + soup = bs4.BeautifulSoup(packed_html, parser) + + image_elements = soup('image') + + for image_element in image_elements: + self.assertTrue("data:image" in image_element['src']) + + +if __name__ == "__main__": + unittest.main() + + From fab2cb70c6d7f92ea1f8e03532e01f98a538698e Mon Sep 17 00:00:00 2001 From: mhibbert Date: Sat, 11 Jan 2020 14:23:29 +0000 Subject: [PATCH 2/5] added support for and test --- htmlark.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/htmlark.py b/htmlark.py index 147f39c..eccdc22 100755 --- a/htmlark.py +++ b/htmlark.py @@ -13,6 +13,8 @@ from urllib.parse import urlparse import bs4 +from bs4 import Tag + # Import requests if available, dummy it if not try: from requests import get as requests_get @@ -186,7 +188,7 @@ def convert_page(page_path: str, parser: str='auto', # Gather all the relevant tags together if not ignore_images: tags += soup('img') - tags += soup('image') + tags += soup('svg') if not ignore_css: csstags = soup('link') for css in csstags: @@ -201,6 +203,10 @@ def convert_page(page_path: str, parser: str='auto', # Convert the linked resources for tag in tags: tag_url = tag['href'] if tag.name == 'link' else tag['src'] + if tag.name.lower() == 'svg': + for element in tag.contents: + if type(element) is Tag: + tag = soup.new_tag('img', src=element['src']) try: # BUG: doesn't work if using relative remote URLs in a local file fullpath = urljoin(page_path, tag_url) From 3e11abf897310b4f379a80c275e89371c6a6d925 Mon Sep 17 00:00:00 2001 From: mhibbert Date: Sat, 11 Jan 2020 14:24:02 +0000 Subject: [PATCH 3/5] added support for and test --- htmlark.py | 1 + 1 file changed, 1 insertion(+) diff --git a/htmlark.py b/htmlark.py index eccdc22..aa9983c 100755 --- a/htmlark.py +++ b/htmlark.py @@ -207,6 +207,7 @@ def convert_page(page_path: str, parser: str='auto', for element in tag.contents: if type(element) is Tag: tag = soup.new_tag('img', src=element['src']) + break try: # BUG: doesn't work if using relative remote URLs in a local file fullpath = urljoin(page_path, tag_url) From 7006c348d8d1fe7f58224ed026332ac485beb198 Mon Sep 17 00:00:00 2001 From: mhibbert Date: Sat, 11 Jan 2020 16:35:43 +0000 Subject: [PATCH 4/5] added support for and test --- htmlark.py | 18 ++++++++++++++---- tests/test_htmlark.py | 7 +++++-- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/htmlark.py b/htmlark.py index aa9983c..9a25d60 100755 --- a/htmlark.py +++ b/htmlark.py @@ -202,16 +202,26 @@ def convert_page(page_path: str, parser: str='auto', # Convert the linked resources for tag in tags: - tag_url = tag['href'] if tag.name == 'link' else tag['src'] + tag_url = '' + if tag.name.lower() == 'svg': for element in tag.contents: - if type(element) is Tag: - tag = soup.new_tag('img', src=element['src']) - break + if type(element) is Tag and element.name.lower() == 'image': + image_tag = soup.new_tag('img', src=element['src']) + tag.replace_with(image_tag) + tag = image_tag + + tag_url = tag['href'] if tag.name == 'link' else tag['src'] + else: + continue + else: + tag_url = tag['href'] if tag.name == 'link' else tag['src'] + try: # BUG: doesn't work if using relative remote URLs in a local file fullpath = urljoin(page_path, tag_url) tag_mime, tag_data = _get_resource(fullpath) + except RequestException: callback('ERROR', tag.name, "Can't access URL " + fullpath) if not ignore_errors: diff --git a/tests/test_htmlark.py b/tests/test_htmlark.py index db66119..c714150 100644 --- a/tests/test_htmlark.py +++ b/tests/test_htmlark.py @@ -5,6 +5,7 @@ import unittest import bs4 import htmlark +from bs4 import Tag # Check for existance of requests requests_spec = importlib.util.find_spec('requests') @@ -59,10 +60,12 @@ def test_get_image_element_and_create_dataa_uri(self): soup = bs4.BeautifulSoup(packed_html, parser) - image_elements = soup('image') + image_elements = soup('svg') for image_element in image_elements: - self.assertTrue("data:image" in image_element['src']) + for element in image_element.contents: + if type(element) is Tag: + self.assertTrue(element.name.lower() != 'image') if __name__ == "__main__": From 83ce0f41e1b5e3a72b9381cf06519a1153c030f2 Mon Sep 17 00:00:00 2001 From: mhibbert Date: Sat, 11 Jan 2020 16:47:23 +0000 Subject: [PATCH 5/5] added support for user agents --- htmlark.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/htmlark.py b/htmlark.py index 9a25d60..31de609 100755 --- a/htmlark.py +++ b/htmlark.py @@ -19,6 +19,8 @@ try: from requests import get as requests_get from requests import RequestException + + except ImportError: requests_get = None @@ -26,6 +28,10 @@ class RequestException(Exception): # NOQA make flake8 shut up """Dummy exception for when Requests is not installed.""" pass +headers = { + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36" +} + PARSERS = ['lxml', 'html5lib', 'html.parser'] @@ -58,7 +64,7 @@ def _get_resource(resource_url: str) -> (str, bytes): if url_parsed.scheme in ['http', 'https']: # Requests might not be installed if requests_get is not None: - request = requests_get(resource_url) + request = requests_get(resource_url, headers=headers) data = request.content if 'Content-Type' in request.headers: mimetype = request.headers['Content-Type']