ElementTree provides parsing from and serialization to files,
some iteration mechanisms and finding utilities,
including XPath and by-tag lookup,
but no CSS selectors or by-attribute search.
It can also be instantiated with a root Element.
# Wrap an existing root Element in an ElementTree and loop over its elements.
tree = ET.ElementTree(root)
for _ in tree.iter():
    ...  # placeholder for the per-element work
The tree can be written to a path or to a buffer; the output is not indented.
# Serialize the tree into the binary stream underlying standard output.
tree.write(sys.stdout.buffer)
# Finish the output with a newline.
print()
Parsing XHTML with html.HTMLParser
In its simplest form, it provides handlers for
start and end tags, which receive the extracted tag name and
the attributes as a list of (key, value) pairs.
The example below prints every element that has an href attribute.
from html.parser import HTMLParser
class LinkParser(HTMLParser):
    """Print every element that carries an ``href`` attribute.

    Only ``handle_starttag`` is overridden: the base class implementation of
    ``handle_startendtag`` forwards to ``handle_starttag``, so both regular
    start tags (``<a href=...>``) and self-closed tags (``<link href=... />``)
    are covered.
    """

    def handle_starttag(self, tag, attrs):
        """Print ``tag`` and ``attrs`` when one of the attributes is ``href``.

        ``attrs`` is a list of ``(name, value)`` pairs, so the attribute name
        has to be compared against the first item of each pair; a plain
        ``"href" in attrs`` would compare it against the pairs themselves.
        """
        if any(name == "href" for name, _ in attrs):
            print(tag, attrs)
The next articles present a more complete
DOM construction from the start / end tags.
Supporting class implementing the tree
def repr_attrs(**attrs):
    """Render the public attributes as a ``key="value"`` string.

    Keys starting with an underscore are treated as private and skipped.
    The result starts with a single space when at least one public attribute
    is rendered, and is the empty string otherwise, so it can be appended
    directly after a tag name.
    """
    rendered = [
        '%s="%s"' % (key, value)
        for key, value in attrs.items()
        # startswith() also copes with an empty key, unlike key[0]
        if not key.startswith('_')
    ]
    # the separator space depends on what is actually rendered, not on the
    # mere presence of (possibly all private) attributes
    return ' ' + ' '.join(rendered) if rendered else ''
class Element:
    """Minimal tree node holding a tag, text, a parent link and children.

    Internal state is kept in underscore-prefixed attributes (``_tag``,
    ``_text``, ``_prev`` for the parent and ``_next`` for the children);
    every other keyword attribute is stored as-is in the instance
    ``__dict__`` next to them.
    """

    @property
    def parents(self):
        """Tuple of the ancestors, nearest first; empty without a parent."""
        chain = []
        node = self._prev
        while node:
            chain.append(node)
            node = node._prev
        return tuple(chain)

    def __init__(self, tag, *children, **attrs):
        """Create a node named ``tag`` with the given children and attributes."""
        self._tag = tag
        self._text = ''
        self._prev = None
        self._next = [*children]
        # merged last, so keyword attributes override the defaults set above
        self.__dict__ |= attrs

    def __iter__(self):
        """Iterate over the direct children."""
        yield from self._next

    def __repr__(self):
        return '[%s%s]' \
            % (self._tag, repr_attrs(**self.__dict__))

    def nest(self, tag, attrs, parent):
        """Append and return a new child element.

        ``attrs`` is an iterable of ``(key, value)`` pairs (or a mapping) and
        ``parent`` becomes the ``_prev`` link of the child.
        """
        child = Element(tag)
        # Written straight into __dict__ rather than forwarded as **kwargs:
        # an attribute named "tag", "self" or "_prev" would otherwise collide
        # with a constructor argument and raise TypeError.
        child.__dict__.update(attrs)
        child._prev = parent
        self._next.append(child)
        return child

    def attr(self, key):
        """Return the attribute stored under ``key``, or None when missing."""
        return self.__dict__.get(key)
Using HTMLParser to build the Element Tree
from html.parser import HTMLParser
from pathlib import Path
class ElementParser(HTMLParser):
    """Build an ``Element`` tree from the markup file found at ``path``.

    The tree root is available as ``doc``; ``co`` is the element currently
    open while parsing. Tags listed in ``VOID`` must be written self-closed
    (``<br/>``), every other tag as a start / end pair. A violation raises
    ``ValueError``, like a mismatched end tag does.
    """

    # valid startendtag void elements in HTML
    VOID = (*"meta link base embed".split(),
            *"col area hr br wbr input img track".split(),
            # avoiding SVG tags
            *"use defs circle ellipse rect".split(),
            *"path line polyline".split())

    def __init__(self, path, encoding=None):
        """Parse the file at ``path``.

        ``encoding`` is handed to ``Path.read_text``; the default ``None``
        keeps the platform default encoding.
        """
        super().__init__()
        self.doc = Element('#doc', path=path)
        self.co = self.doc
        self.feed(Path(path).read_text(encoding=encoding))
        # force processing of whatever feed() left buffered (trailing text)
        self.close()

    def handle_data(self, data):
        """Append character data to the text of the current element."""
        self.co._text += data

    def handle_starttag(self, tag, attrs):
        """Open a child element and make it the current one."""
        # explicit raise instead of assert: the check must survive python -O
        if tag in self.VOID:
            raise ValueError(tag)
        self.co = self.co.nest(tag, attrs, self.co)

    def handle_startendtag(self, tag, attrs):
        """Add a self-closed child; the current element is left unchanged."""
        if tag not in self.VOID:
            raise ValueError(tag)
        self.co.nest(tag, attrs, self.co)

    def handle_endtag(self, tag):
        """Close the current element and move back to its parent."""
        opened = self.co._tag
        if tag != opened:
            raise ValueError(tag, opened)
        self.co = self.co._prev
Walking the resulting Tree
Printing the href of every link (a element) found in some index.html
def walk_dom(el):
    """Yield every descendant of ``el`` depth-first, parents before children.

    ``el`` itself is not yielded; each yielded node is iterated in turn to
    reach its own children.
    """
    pending = [iter(el)]  # stack of child iterators, innermost last
    while pending:
        try:
            node = next(pending[-1])
        except StopIteration:
            # this level is exhausted, resume with the enclosing one
            pending.pop()
            continue
        yield node
        pending.append(iter(node))
# Print the non-empty href of every <a> element parsed from index.html.
for el in walk_dom(ElementParser('index.html').doc):
    if el._tag != 'a':
        continue
    href = el.attr("href")
    if href:
        print(href)