2 changes: 2 additions & 0 deletions .gitignore
@@ -58,3 +58,5 @@ target/
 
 # Pyenv name
 .python-version
+
+.idea/
28 changes: 26 additions & 2 deletions htmlark.py
@@ -13,17 +13,25 @@
 from urllib.parse import urlparse
 
 import bs4
+from bs4 import Tag
 
 # Import requests if available, dummy it if not
 try:
     from requests import get as requests_get
     from requests import RequestException
+
+
 except ImportError:
     requests_get = None
 
     class RequestException(Exception):  # NOQA make flake8 shut up
         """Dummy exception for when Requests is not installed."""
         pass
 
+headers = {
+    'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
+}
+
 PARSERS = ['lxml', 'html5lib', 'html.parser']
 
 
@@ -56,7 +64,7 @@ def _get_resource(resource_url: str) -> (str, bytes):
     if url_parsed.scheme in ['http', 'https']:
         # Requests might not be installed
         if requests_get is not None:
-            request = requests_get(resource_url)
+            request = requests_get(resource_url, headers=headers)
             data = request.content
             if 'Content-Type' in request.headers:
                 mimetype = request.headers['Content-Type']
@@ -186,6 +194,7 @@ def convert_page(page_path: str, parser: str='auto',
     # Gather all the relevant tags together
     if not ignore_images:
         tags += soup('img')
+        tags += soup('svg')
     if not ignore_css:
         csstags = soup('link')
         for css in csstags:
@@ -199,11 +208,26 @@
 
     # Convert the linked resources
     for tag in tags:
-        tag_url = tag['href'] if tag.name == 'link' else tag['src']
+        tag_url = ''
+
+        if tag.name.lower() == 'svg':
+            for element in tag.contents:
+                if type(element) is Tag and element.name.lower() == 'image':
+                    image_tag = soup.new_tag('img', src=element['src'])
+                    tag.replace_with(image_tag)
+                    tag = image_tag
+
+                    tag_url = tag['href'] if tag.name == 'link' else tag['src']
+                else:
+                    continue
+        else:
+            tag_url = tag['href'] if tag.name == 'link' else tag['src']
+
         try:
             # BUG: doesn't work if using relative remote URLs in a local file
             fullpath = urljoin(page_path, tag_url)
             tag_mime, tag_data = _get_resource(fullpath)
+
         except RequestException:
             callback('ERROR', tag.name, "Can't access URL " + fullpath)
             if not ignore_errors:
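
Note on the htmlark.py changes above: the new module-level headers dict sends a browser-like User-Agent with every requests call, presumably because some servers refuse downloads from clients that do not identify as a browser, and the new branch in convert_page rewrites an <svg> element whose direct <image> child has a src attribute into a plain <img> tag, so that the existing data-URI packing loop can inline it like any other image. Below is a minimal standalone sketch of that rewriting step only; the sample markup, the 'html.parser' choice, and the has_attr/break guards are illustrative and not part of the diff. Real-world SVG <image> elements often use href or xlink:href rather than src; like the diff, the sketch assumes src.

# Standalone sketch of the <svg><image> -> <img> rewrite introduced above.
import bs4
from bs4 import Tag

html = '<div><svg><image src="logo.png"/></svg><img src="photo.jpg"></div>'
soup = bs4.BeautifulSoup(html, 'html.parser')

for svg in soup('svg'):
    for element in svg.contents:
        # Only rewrite direct <image> children that actually carry a src attribute
        if isinstance(element, Tag) and element.name.lower() == 'image' and element.has_attr('src'):
            img = soup.new_tag('img', src=element['src'])
            svg.replace_with(img)  # drop the whole <svg> wrapper in favour of <img>
            break                  # the old <svg> is detached, so stop walking its children

print(soup)  # the first image is now a plain <img src="logo.png"> tag

With the wrapper reduced to an ordinary <img>, the loop in convert_page resolves its src against the page location and replaces it with a base64 data URI, just as it already does for the <img>, <link> and <script> tags it gathers.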
23 changes: 22 additions & 1 deletion tests/test_htmlark.py
@@ -3,8 +3,9 @@
 import importlib
 import os.path
 import unittest
 
+import bs4
 import htmlark
+from bs4 import Tag
 
 # Check for existance of requests
 requests_spec = importlib.util.find_spec('requests')
@@ -51,3 +52,23 @@ def test_get_resource_errors(self):
         htmlark.requests_get = None
         with self.assertRaises(NameError):
             htmlark._get_resource("http://example.com/not/a/real/path.png")
+
+    def test_get_image_element_and_create_data_uri(self):
+        packed_html = htmlark.convert_page('https://www.bbc.co.uk/news/world-africa-51063149', ignore_errors=True)
+
+        parser = htmlark.get_available_parsers()[0]
+
+        soup = bs4.BeautifulSoup(packed_html, parser)
+
+        image_elements = soup('svg')
+
+        for image_element in image_elements:
+            for element in image_element.contents:
+                if type(element) is Tag:
+                    self.assertTrue(element.name.lower() != 'image')
+
+
+if __name__ == "__main__":
+    unittest.main()
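
The new test drives the SVG handling against a live BBC article, so it needs network access and depends on an external page that can change. A possible offline variant is sketched below: it packs a tiny page from a temporary directory and then checks that no raw <image> element survives and that the local file was inlined as a data URI. The page markup, the placeholder logo.png bytes, and the data: assertion are illustrative assumptions, not part of the pull request; only convert_page and get_available_parsers are taken from the code above.

# Offline sketch of the same check as test_get_image_element_and_create_data_uri,
# using a throwaway page on disk instead of a live URL.
import os
import tempfile

import bs4

import htmlark

page_markup = '<html><body><svg><image src="logo.png"/></svg></body></html>'

with tempfile.TemporaryDirectory() as tmp:
    # The bytes need not be a real PNG; htmlark is expected to just base64-encode the file contents.
    with open(os.path.join(tmp, 'logo.png'), 'wb') as f:
        f.write(b'placeholder bytes')
    page_path = os.path.join(tmp, 'page.html')
    with open(page_path, 'w') as f:
        f.write(page_markup)

    packed_html = htmlark.convert_page(page_path)

soup = bs4.BeautifulSoup(packed_html, htmlark.get_available_parsers()[0])

assert soup.find('image') is None  # no raw <image> tags survive packing
img = soup.find('img')
assert img is not None and img['src'].startswith('data:')  # inlined as a data URI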