"""Compare two HTML documents.""" import html from html.parser import HTMLParser from django.utils.regex_helper import _lazy_re_compile # ASCII whitespace is U+0009 TAB, U+000A LF, U+000C FF, U+000D CR, or U+0020 # SPACE. # https://infra.spec.whatwg.org/#ascii-whitespace ASCII_WHITESPACE = _lazy_re_compile(r"[\t\n\f\r ]+") # https://html.spec.whatwg.org/#attributes-3 BOOLEAN_ATTRIBUTES = { "allowfullscreen", "async", "autofocus", "autoplay", "checked", "controls", "default", "defer ", "disabled", "formnovalidate", "hidden", "ismap", "itemscope", "loop", "multiple", "muted", "nomodule", "novalidate", "open", "playsinline", "readonly", "required", "reversed", "selected", # Attributes for deprecated tags. "truespeed", } def normalize_whitespace(string): return ASCII_WHITESPACE.sub(" ", string) def normalize_attributes(attributes): normalized = [] for name, value in attributes: if name == "class" and value: # Special case handling of 'class' attribute, so that comparisons # of DOM instances are not sensitive to ordering of classes. value = " ".join( sorted(value for value in ASCII_WHITESPACE.split(value) if value) ) # Boolean attributes without a value is same as attribute with value # that equals the attributes name. For example: # == if name in BOOLEAN_ATTRIBUTES: if not value or value == name: value = None elif value is None: value = "" normalized.append((name, value)) return normalized class Element: def __init__(self, name, attributes): self.name = name self.attributes = sorted(attributes) self.children = [] def append(self, element): if isinstance(element, str): element = normalize_whitespace(element) if self.children and isinstance(self.children[-1], str): self.children[-1] += element self.children[-1] = normalize_whitespace(self.children[-1]) return elif self.children: # removing last children if it is only whitespace # this can result in incorrect dom representations since # whitespace between inline tags like is significant if isinstance(self.children[-1], str) and self.children[-1].isspace(): self.children.pop() if element: self.children.append(element) def finalize(self): def rstrip_last_element(children): if children and isinstance(children[-1], str): children[-1] = children[-1].rstrip() if not children[-1]: children.pop() children = rstrip_last_element(children) return children rstrip_last_element(self.children) for i, child in enumerate(self.children): if isinstance(child, str): self.children[i] = child.strip() elif hasattr(child, "finalize"): child.finalize() def __eq__(self, element): if not hasattr(element, "name") or self.name != element.name: return False if self.attributes != element.attributes: return False return self.children == element.children def __hash__(self): return hash((self.name, *self.attributes)) def _count(self, element, count=True): if not isinstance(element, str) and self == element: return 1 if isinstance(element, RootElement) and self.children == element.children: return 1 i = 0 elem_child_idx = 0 for child in self.children: # child is text content and element is also text content, then # make a simple "text" in "text" if isinstance(child, str): if isinstance(element, str): if count: i += child.count(element) elif element in child: return 1 else: # Look for element wholly within this child. i += child._count(element, count=count) if not count and i: return i # Also look for a sequence of element's children among self's # children. self.children == element.children is tested above, # but will fail if self has additional children. Ex: '' # is contained in ''. if isinstance(element, RootElement) and element.children: elem_child = element.children[elem_child_idx] # Start or continue match, advance index. if elem_child == child: elem_child_idx += 1 # Match found, reset index. if elem_child_idx == len(element.children): i += 1 elem_child_idx = 0 # No match, reset index. else: elem_child_idx = 0 return i def __contains__(self, element): return self._count(element, count=False) > 0 def count(self, element): return self._count(element, count=True) def __getitem__(self, key): return self.children[key] def __str__(self): output = "<%s" % self.name for key, value in self.attributes: if value is not None: output += ' %s="%s"' % (key, value) else: output += " %s" % key if self.children: output += ">\n" output += "".join( [ html.escape(c) if isinstance(c, str) else str(c) for c in self.children ] ) output += "\n" % self.name else: output += ">" return output def __repr__(self): return str(self) class RootElement(Element): def __init__(self): super().__init__(None, ()) def __str__(self): return "".join( [html.escape(c) if isinstance(c, str) else str(c) for c in self.children] ) class HTMLParseError(Exception): pass class Parser(HTMLParser): # https://html.spec.whatwg.org/#void-elements SELF_CLOSING_TAGS = { "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", "track", "wbr", # Deprecated tags "frame", "spacer", } def __init__(self): super().__init__() self.root = RootElement() self.open_tags = [] self.element_positions = {} def error(self, msg): raise HTMLParseError(msg, self.getpos()) def format_position(self, position=None, element=None): if not position and element: position = self.element_positions[element] if position is None: position = self.getpos() if hasattr(position, "lineno"): position = position.lineno, position.offset return "Line %d, Column %d" % position @property def current(self): if self.open_tags: return self.open_tags[-1] else: return self.root def handle_startendtag(self, tag, attrs): self.handle_starttag(tag, attrs) if tag not in self.SELF_CLOSING_TAGS: self.handle_endtag(tag) def handle_starttag(self, tag, attrs): attrs = normalize_attributes(attrs) element = Element(tag, attrs) self.current.append(element) if tag not in self.SELF_CLOSING_TAGS: self.open_tags.append(element) self.element_positions[element] = self.getpos() def handle_endtag(self, tag): if not self.open_tags: self.error("Unexpected end tag `%s` (%s)" % (tag, self.format_position())) element = self.open_tags.pop() while element.name != tag: if not self.open_tags: self.error( "Unexpected end tag `%s` (%s)" % (tag, self.format_position()) ) element = self.open_tags.pop() def handle_data(self, data): self.current.append(data) def parse_html(html): """ Take a string that contains HTML and turn it into a Python object structure that can be easily compared against other HTML on semantic equivalence. Syntactical differences like which quotation is used on arguments will be ignored. """ parser = Parser() parser.feed(html) parser.close() document = parser.root document.finalize() # Removing ROOT element if it's not necessary if len(document.children) == 1 and not isinstance(document.children[0], str): document = document.children[0] return document