Skip to content

Commit 0869a71

Browse files
bpo-41748: Handles unquoted attributes with commas (GH-24072)
* bpo-41748: Adds tests for unquoted attributes with comma
* bpo-41748: Handles unquoted attributes with comma
* bpo-41748: Addresses review comments
* bpo-41748: Addresses review comments
* Adds more test cases
* Simplifies the regex for handling spaces
* bpo-41748: Moves attributes tests under the right class
* bpo-41748: Addresses review about duplicate attributes
* bpo-41748: Adds NEWS.d entry for this patch

(cherry picked from commit 9eb11a1)

Co-authored-by: Karl Dubost <[email protected]>
1 parent aab84a5 commit 0869a71

File tree

3 files changed

+59
-37
lines changed

3 files changed

+59
-37
lines changed

Lib/html/parser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@
4747
|"[^"]*" # LIT-enclosed value
4848
|(?!['"])[^>\s]* # bare value
4949
)
50-
(?:\s*,)* # possibly followed by a comma
50+
\s* # possibly followed by a space
5151
)?(?:\s|/(?!>))*
5252
)*
5353
)?

Lib/test/test_htmlparser.py

Lines changed: 56 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -452,42 +452,6 @@ def test_illegal_declarations(self):
452452
self._run_check('<!spacer type="block" height="25">',
453453
[('comment', 'spacer type="block" height="25"')])
454454

455-
def test_with_unquoted_attributes(self):
456-
# see #12008
457-
html = ("<html><body bgcolor=d0ca90 text='181008'>"
458-
"<table cellspacing=0 cellpadding=1 width=100% ><tr>"
459-
"<td align=left><font size=-1>"
460-
"- <a href=/rabota/><span class=en> software-and-i</span></a>"
461-
"- <a href='/1/'><span class=en> library</span></a></table>")
462-
expected = [
463-
('starttag', 'html', []),
464-
('starttag', 'body', [('bgcolor', 'd0ca90'), ('text', '181008')]),
465-
('starttag', 'table',
466-
[('cellspacing', '0'), ('cellpadding', '1'), ('width', '100%')]),
467-
('starttag', 'tr', []),
468-
('starttag', 'td', [('align', 'left')]),
469-
('starttag', 'font', [('size', '-1')]),
470-
('data', '- '), ('starttag', 'a', [('href', '/rabota/')]),
471-
('starttag', 'span', [('class', 'en')]), ('data', ' software-and-i'),
472-
('endtag', 'span'), ('endtag', 'a'),
473-
('data', '- '), ('starttag', 'a', [('href', '/1/')]),
474-
('starttag', 'span', [('class', 'en')]), ('data', ' library'),
475-
('endtag', 'span'), ('endtag', 'a'), ('endtag', 'table')
476-
]
477-
self._run_check(html, expected)
478-
479-
def test_comma_between_attributes(self):
480-
self._run_check('<form action="/xxx.php?a=1&amp;b=2&amp", '
481-
'method="post">', [
482-
('starttag', 'form',
483-
[('action', '/xxx.php?a=1&b=2&'),
484-
(',', None), ('method', 'post')])])
485-
486-
def test_weird_chars_in_unquoted_attribute_values(self):
487-
self._run_check('<form action=bogus|&#()value>', [
488-
('starttag', 'form',
489-
[('action', 'bogus|&#()value')])])
490-
491455
def test_invalid_end_tags(self):
492456
# A collection of broken end tags. <br> is used as separator.
493457
# see http://www.w3.org/TR/html5/tokenization.html#end-tag-open-state
@@ -773,6 +737,62 @@ def test_end_tag_in_attribute_value(self):
773737
[("href", "http://www.example.org/\">;")]),
774738
("data", "spam"), ("endtag", "a")])
775739

740+
def test_with_unquoted_attributes(self):
741+
# see #12008
742+
html = ("<html><body bgcolor=d0ca90 text='181008'>"
743+
"<table cellspacing=0 cellpadding=1 width=100% ><tr>"
744+
"<td align=left><font size=-1>"
745+
"- <a href=/rabota/><span class=en> software-and-i</span></a>"
746+
"- <a href='/1/'><span class=en> library</span></a></table>")
747+
expected = [
748+
('starttag', 'html', []),
749+
('starttag', 'body', [('bgcolor', 'd0ca90'), ('text', '181008')]),
750+
('starttag', 'table',
751+
[('cellspacing', '0'), ('cellpadding', '1'), ('width', '100%')]),
752+
('starttag', 'tr', []),
753+
('starttag', 'td', [('align', 'left')]),
754+
('starttag', 'font', [('size', '-1')]),
755+
('data', '- '), ('starttag', 'a', [('href', '/rabota/')]),
756+
('starttag', 'span', [('class', 'en')]), ('data', ' software-and-i'),
757+
('endtag', 'span'), ('endtag', 'a'),
758+
('data', '- '), ('starttag', 'a', [('href', '/1/')]),
759+
('starttag', 'span', [('class', 'en')]), ('data', ' library'),
760+
('endtag', 'span'), ('endtag', 'a'), ('endtag', 'table')
761+
]
762+
self._run_check(html, expected)
763+
764+
def test_comma_between_attributes(self):
765+
# see bpo 41748
766+
# HTMLParser preserves duplicate attributes, leaving the task of
767+
# removing duplicate attributes to a conformant html tree builder
768+
html = ('<div class=bar,baz=asd>' # between attrs (unquoted)
769+
'<div class="bar",baz="asd">' # between attrs (quoted)
770+
'<div class=bar, baz=asd,>' # after values (unquoted)
771+
'<div class="bar", baz="asd",>' # after values (quoted)
772+
'<div class="bar",>' # one comma values (quoted)
773+
'<div class=,bar baz=,asd>' # before values (unquoted)
774+
'<div class=,"bar" baz=,"asd">' # before values (quoted)
775+
'<div ,class=bar ,baz=asd>' # before names
776+
'<div class,="bar" baz,="asd">' # after names
777+
)
778+
expected = [
779+
('starttag', 'div', [('class', 'bar,baz=asd'),]),
780+
('starttag', 'div', [('class', 'bar'), (',baz', 'asd')]),
781+
('starttag', 'div', [('class', 'bar,'), ('baz', 'asd,')]),
782+
('starttag', 'div', [('class', 'bar'), (',', None),
783+
('baz', 'asd'), (',', None)]),
784+
('starttag', 'div', [('class', 'bar'), (',', None)]),
785+
('starttag', 'div', [('class', ',bar'), ('baz', ',asd')]),
786+
('starttag', 'div', [('class', ',"bar"'), ('baz', ',"asd"')]),
787+
('starttag', 'div', [(',class', 'bar'), (',baz', 'asd')]),
788+
('starttag', 'div', [('class,', 'bar'), ('baz,', 'asd')]),
789+
]
790+
self._run_check(html, expected)
791+
792+
def test_weird_chars_in_unquoted_attribute_values(self):
793+
self._run_check('<form action=bogus|&#()value>', [
794+
('starttag', 'form',
795+
[('action', 'bogus|&#()value')])])
776796

777797
if __name__ == "__main__":
778798
unittest.main()
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Fix HTMLParser parsing rules for element attributes containing
2+
commas with spaces. Patch by Karl Dubost.

0 commit comments

Comments
 (0)