diff options
author | Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com> | 2021-02-01 12:52:52 -0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-02-01 12:52:52 -0800 |
commit | 0869a713f21f4b2fe021d802cf18f1b1af53695f (patch) | |
tree | 4e14dbe1f19ed86ac0000e9d239794a1341c82b3 | |
parent | bpo-43016: Fix test_curses on platform without cursesw (GH-24405) (GH-24408) (diff) | |
download | cpython-0869a713f21f4b2fe021d802cf18f1b1af53695f.tar.gz cpython-0869a713f21f4b2fe021d802cf18f1b1af53695f.tar.bz2 cpython-0869a713f21f4b2fe021d802cf18f1b1af53695f.zip |
bpo-41748: Handles unquoted attributes with commas (GH-24072)
* bpo-41748: Adds tests for unquoted attributes with comma
* bpo-41748: Handles unquoted attributes with comma
* bpo-41748: Addresses review comments
* bpo-41748: Addresses review comments
* Adds more test cases
* Simplifies the regex for handling spaces
* bpo-41748: Moves attributes tests under the right class
* bpo-41748: Addresses review about duplicate attributes
* bpo-41748: Adds NEWS.d entry for this patch
(cherry picked from commit 9eb11a139fac5514d8456626806a68b3e3b7eafb)
Co-authored-by: Karl Dubost <karl+github@la-grange.net>
-rw-r--r-- | Lib/html/parser.py | 2 | ||||
-rw-r--r-- | Lib/test/test_htmlparser.py | 92 | ||||
-rw-r--r-- | Misc/NEWS.d/next/Library/2021-01-05-21-26-29.bpo-41748.KdC0w3.rst | 2 |
3 files changed, 59 insertions, 37 deletions
diff --git a/Lib/html/parser.py b/Lib/html/parser.py index de81879a631..d19684ed117 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -47,7 +47,7 @@ locatestarttagend_tolerant = re.compile(r""" |"[^"]*" # LIT-enclosed value |(?!['"])[^>\s]* # bare value ) - (?:\s*,)* # possibly followed by a comma + \s* # possibly followed by a space )?(?:\s|/(?!>))* )* )? diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 326e34290ff..d0dc67d9174 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -452,42 +452,6 @@ text self._run_check('<!spacer type="block" height="25">', [('comment', 'spacer type="block" height="25"')]) - def test_with_unquoted_attributes(self): - # see #12008 - html = ("<html><body bgcolor=d0ca90 text='181008'>" - "<table cellspacing=0 cellpadding=1 width=100% ><tr>" - "<td align=left><font size=-1>" - "- <a href=/rabota/><span class=en> software-and-i</span></a>" - "- <a href='/1/'><span class=en> library</span></a></table>") - expected = [ - ('starttag', 'html', []), - ('starttag', 'body', [('bgcolor', 'd0ca90'), ('text', '181008')]), - ('starttag', 'table', - [('cellspacing', '0'), ('cellpadding', '1'), ('width', '100%')]), - ('starttag', 'tr', []), - ('starttag', 'td', [('align', 'left')]), - ('starttag', 'font', [('size', '-1')]), - ('data', '- '), ('starttag', 'a', [('href', '/rabota/')]), - ('starttag', 'span', [('class', 'en')]), ('data', ' software-and-i'), - ('endtag', 'span'), ('endtag', 'a'), - ('data', '- '), ('starttag', 'a', [('href', '/1/')]), - ('starttag', 'span', [('class', 'en')]), ('data', ' library'), - ('endtag', 'span'), ('endtag', 'a'), ('endtag', 'table') - ] - self._run_check(html, expected) - - def test_comma_between_attributes(self): - self._run_check('<form action="/xxx.php?a=1&b=2&", ' - 'method="post">', [ - ('starttag', 'form', - [('action', '/xxx.php?a=1&b=2&'), - (',', None), ('method', 'post')])]) - - def test_weird_chars_in_unquoted_attribute_values(self): - 
self._run_check('<form action=bogus|&#()value>', [ - ('starttag', 'form', - [('action', 'bogus|&#()value')])]) - def test_invalid_end_tags(self): # A collection of broken end tags. <br> is used as separator. # see http://www.w3.org/TR/html5/tokenization.html#end-tag-open-state @@ -773,6 +737,62 @@ class AttributesTestCase(TestCaseBase): [("href", "http://www.example.org/\">;")]), ("data", "spam"), ("endtag", "a")]) + def test_with_unquoted_attributes(self): + # see #12008 + html = ("<html><body bgcolor=d0ca90 text='181008'>" + "<table cellspacing=0 cellpadding=1 width=100% ><tr>" + "<td align=left><font size=-1>" + "- <a href=/rabota/><span class=en> software-and-i</span></a>" + "- <a href='/1/'><span class=en> library</span></a></table>") + expected = [ + ('starttag', 'html', []), + ('starttag', 'body', [('bgcolor', 'd0ca90'), ('text', '181008')]), + ('starttag', 'table', + [('cellspacing', '0'), ('cellpadding', '1'), ('width', '100%')]), + ('starttag', 'tr', []), + ('starttag', 'td', [('align', 'left')]), + ('starttag', 'font', [('size', '-1')]), + ('data', '- '), ('starttag', 'a', [('href', '/rabota/')]), + ('starttag', 'span', [('class', 'en')]), ('data', ' software-and-i'), + ('endtag', 'span'), ('endtag', 'a'), + ('data', '- '), ('starttag', 'a', [('href', '/1/')]), + ('starttag', 'span', [('class', 'en')]), ('data', ' library'), + ('endtag', 'span'), ('endtag', 'a'), ('endtag', 'table') + ] + self._run_check(html, expected) + + def test_comma_between_attributes(self): + # see bpo 41748 + # HTMLParser preserves duplicate attributes, leaving the task of + # removing duplicate attributes to a conformant html tree builder + html = ('<div class=bar,baz=asd>' # between attrs (unquoted) + '<div class="bar",baz="asd">' # between attrs (quoted) + '<div class=bar, baz=asd,>' # after values (unquoted) + '<div class="bar", baz="asd",>' # after values (quoted) + '<div class="bar",>' # one comma values (quoted) + '<div class=,bar baz=,asd>' # before values (unquoted) + 
'<div class=,"bar" baz=,"asd">' # before values (quoted) + '<div ,class=bar ,baz=asd>' # before names + '<div class,="bar" baz,="asd">' # after names + ) + expected = [ + ('starttag', 'div', [('class', 'bar,baz=asd'),]), + ('starttag', 'div', [('class', 'bar'), (',baz', 'asd')]), + ('starttag', 'div', [('class', 'bar,'), ('baz', 'asd,')]), + ('starttag', 'div', [('class', 'bar'), (',', None), + ('baz', 'asd'), (',', None)]), + ('starttag', 'div', [('class', 'bar'), (',', None)]), + ('starttag', 'div', [('class', ',bar'), ('baz', ',asd')]), + ('starttag', 'div', [('class', ',"bar"'), ('baz', ',"asd"')]), + ('starttag', 'div', [(',class', 'bar'), (',baz', 'asd')]), + ('starttag', 'div', [('class,', 'bar'), ('baz,', 'asd')]), + ] + self._run_check(html, expected) + + def test_weird_chars_in_unquoted_attribute_values(self): + self._run_check('<form action=bogus|&#()value>', [ + ('starttag', 'form', + [('action', 'bogus|&#()value')])]) if __name__ == "__main__": unittest.main() diff --git a/Misc/NEWS.d/next/Library/2021-01-05-21-26-29.bpo-41748.KdC0w3.rst b/Misc/NEWS.d/next/Library/2021-01-05-21-26-29.bpo-41748.KdC0w3.rst new file mode 100644 index 00000000000..52efa3ac3d4 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2021-01-05-21-26-29.bpo-41748.KdC0w3.rst @@ -0,0 +1,2 @@ +Fix HTMLParser parsing rules for element attributes containing +commas with spaces. Patch by Karl Dubost.
\ No newline at end of file |