bpo-41748: Handles unquoted attributes with commas (GH-24072)

miss-islington · karlcow · web-flow · commit 0869a713f21f · 2021-02-01T12:52:52.000-08:00
* bpo-41748: Adds tests for unquoted attributes with comma * bpo-41748: Handles unquoted attributes with comma * bpo-41748: Addresses review comments * bpo-41748: Addresses review comments * Adds more test cases * Simplifies the regex for handling spaces * bpo-41748: Moves attributes tests under the right class * bpo-41748: Addresses review about duplicate attributes * bpo-41748: Adds NEWS.d entry for this patch (cherry picked from commit 9eb11a1) Co-authored-by: Karl Dubost <karl+github@la-grange.net>
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
@@ -47,7 +47,7 @@
           |"[^"]*"                   # LIT-enclosed value
           |(?!['"])[^>\s]*           # bare value
          )
-         (?:\s*,)*                   # possibly followed by a comma
+        \s*                          # possibly followed by a space
        )?(?:\s|/(?!>))*
      )*
    )?
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
@@ -452,42 +452,6 @@ def test_illegal_declarations(self):
         self._run_check('<!spacer type="block" height="25">',
                         [('comment', 'spacer type="block" height="25"')])
 
-    def test_with_unquoted_attributes(self):
-        # see #12008
-        html = ("<html><body bgcolor=d0ca90 text='181008'>"
-                "<table cellspacing=0 cellpadding=1 width=100% ><tr>"
-                "<td align=left><font size=-1>"
-                "- <a href=/rabota/><span class=en> software-and-i</span></a>"
-                "- <a href='/1/'><span class=en> library</span></a></table>")
-        expected = [
-            ('starttag', 'html', []),
-            ('starttag', 'body', [('bgcolor', 'd0ca90'), ('text', '181008')]),
-            ('starttag', 'table',
-                [('cellspacing', '0'), ('cellpadding', '1'), ('width', '100%')]),
-            ('starttag', 'tr', []),
-            ('starttag', 'td', [('align', 'left')]),
-            ('starttag', 'font', [('size', '-1')]),
-            ('data', '- '), ('starttag', 'a', [('href', '/rabota/')]),
-            ('starttag', 'span', [('class', 'en')]), ('data', ' software-and-i'),
-            ('endtag', 'span'), ('endtag', 'a'),
-            ('data', '- '), ('starttag', 'a', [('href', '/1/')]),
-            ('starttag', 'span', [('class', 'en')]), ('data', ' library'),
-            ('endtag', 'span'), ('endtag', 'a'), ('endtag', 'table')
-        ]
-        self._run_check(html, expected)
-
-    def test_comma_between_attributes(self):
-        self._run_check('<form action="/xxx.php?a=1&amp;b=2&amp", '
-                        'method="post">', [
-                            ('starttag', 'form',
-                                [('action', '/xxx.php?a=1&b=2&'),
-                                 (',', None), ('method', 'post')])])
-
-    def test_weird_chars_in_unquoted_attribute_values(self):
-        self._run_check('<form action=bogus|&#()value>', [
-                            ('starttag', 'form',
-                                [('action', 'bogus|&#()value')])])
-
     def test_invalid_end_tags(self):
         # A collection of broken end tags. <br> is used as separator.
         # see http://www.w3.org/TR/html5/tokenization.html#end-tag-open-state
@@ -773,6 +737,62 @@ def test_end_tag_in_attribute_value(self):
                           [("href", "http://www.example.org/\">;")]),
                          ("data", "spam"), ("endtag", "a")])
 
+    def test_with_unquoted_attributes(self):
+        # see #12008
+        html = ("<html><body bgcolor=d0ca90 text='181008'>"
+                "<table cellspacing=0 cellpadding=1 width=100% ><tr>"
+                "<td align=left><font size=-1>"
+                "- <a href=/rabota/><span class=en> software-and-i</span></a>"
+                "- <a href='/1/'><span class=en> library</span></a></table>")
+        expected = [
+            ('starttag', 'html', []),
+            ('starttag', 'body', [('bgcolor', 'd0ca90'), ('text', '181008')]),
+            ('starttag', 'table',
+                [('cellspacing', '0'), ('cellpadding', '1'), ('width', '100%')]),
+            ('starttag', 'tr', []),
+            ('starttag', 'td', [('align', 'left')]),
+            ('starttag', 'font', [('size', '-1')]),
+            ('data', '- '), ('starttag', 'a', [('href', '/rabota/')]),
+            ('starttag', 'span', [('class', 'en')]), ('data', ' software-and-i'),
+            ('endtag', 'span'), ('endtag', 'a'),
+            ('data', '- '), ('starttag', 'a', [('href', '/1/')]),
+            ('starttag', 'span', [('class', 'en')]), ('data', ' library'),
+            ('endtag', 'span'), ('endtag', 'a'), ('endtag', 'table')
+        ]
+        self._run_check(html, expected)
+
+    def test_comma_between_attributes(self):
+        # see bpo 41478
+        # HTMLParser preserves duplicate attributes, leaving the task of
+        # removing duplicate attributes to a conformant html tree builder
+        html = ('<div class=bar,baz=asd>'        # between attrs (unquoted)
+                '<div class="bar",baz="asd">'    # between attrs (quoted)
+                '<div class=bar, baz=asd,>'      # after values (unquoted)
+                '<div class="bar", baz="asd",>'  # after values (quoted)
+                '<div class="bar",>'             # one comma values (quoted)
+                '<div class=,bar baz=,asd>'      # before values (unquoted)
+                '<div class=,"bar" baz=,"asd">'  # before values (quoted)
+                '<div ,class=bar ,baz=asd>'      # before names
+                '<div class,="bar" baz,="asd">'  # after names
+        )
+        expected = [
+            ('starttag', 'div', [('class', 'bar,baz=asd'),]),
+            ('starttag', 'div', [('class', 'bar'), (',baz', 'asd')]),
+            ('starttag', 'div', [('class', 'bar,'), ('baz', 'asd,')]),
+            ('starttag', 'div', [('class', 'bar'), (',', None),
+                                 ('baz', 'asd'), (',', None)]),
+            ('starttag', 'div', [('class', 'bar'), (',', None)]),
+            ('starttag', 'div', [('class', ',bar'), ('baz', ',asd')]),
+            ('starttag', 'div', [('class', ',"bar"'), ('baz', ',"asd"')]),
+            ('starttag', 'div', [(',class', 'bar'), (',baz', 'asd')]),
+            ('starttag', 'div', [('class,', 'bar'), ('baz,', 'asd')]),
+        ]
+        self._run_check(html, expected)
+
+    def test_weird_chars_in_unquoted_attribute_values(self):
+        self._run_check('<form action=bogus|&#()value>', [
+                            ('starttag', 'form',
+                                [('action', 'bogus|&#()value')])])
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/Misc/NEWS.d/next/Library/2021-01-05-21-26-29.bpo-41748.KdC0w3.rst b/Misc/NEWS.d/next/Library/2021-01-05-21-26-29.bpo-41748.KdC0w3.rst
@@ -0,0 +1,2 @@
+Fix HTMLParser parsing rules for element attributes containing
+commas with spaces. Patch by Karl Dubost.

Original file line number	Diff line number	Diff line change
`@@ -47,7 +47,7 @@`
`47`	`47`	`\|"[^"]*" # LIT-enclosed value`
`48`	`48`	`\|(?!['"])[^>\s]* # bare value`
`49`	`49`	`)`
`50`		`- (?:\s,) # possibly followed by a comma`
	`50`	`+ \s* # possibly followed by a space`
`51`	`51`	`)?(?:\s\|/(?!>))*`
`52`	`52`	`)*`
`53`	`53`	`)?`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+Fix HTMLParser parsing rules for element attributes containing`
	`2`	`+commas with spaces. Patch by Karl Dubost.`