hat.json.vt

Virtual tree XML parser

  1"""Virtual tree XML parser"""
  2
  3import os
  4import typing
  5import xml.sax
  6import xml.sax.handler
  7import xml.sax.xmlreader
  8
  9from hat.json.data import Data
 10
 11
 12def parse(file: typing.Union[os.PathLike, xml.sax.xmlreader.InputSource]
 13          ) -> Data:
 14    r"""Parse XML document into virtual tree
 15
 16    Each element is recursively parsed into a list with the following
 17    structure, starting from the root of a document:
 18
 19        * First item is a valid CSS selector string, consisting of element tag
 20          name; and optionally `id` and `class` attributes if present.
 21
 22        * If the element has attributes other than `id` or `class`, they are
 23          stored as a second item. The item is a dictionary which has an
 24          `attrs` key, whose value is another dictionary, with key-value pairs
 25          representing attribute names and their values, respectively.
 26
 27        * All other items are element content. Each item is either a
 28          recursively parsed element child (a list), or text (a string).
 29
 30    Resulting structure is JSON serializable.
 31
 32    Namespace prefix declaration attributes (`xmlns:*`) are ignored.
 33
 34    Example usage::
 35
 36        import io
 37        from hat import json
 38
 39        xml = '''\
 40            <html>
 41                <body>
 42                    <div id="first" class="c1 c2">
 43                        Banana
 44                    </div>
 45                    Orange
 46                    <br/>
 47                    <span id="second" style="color:green">
 48                        Watermelon
 49                    </span>
 50                </body>
 51            </html>
 52        '''
 53        stripped = ''.join(line.lstrip() for line in xml.split('\n'))
 54        stream = io.StringIO(stripped)
 55        parsed = json.vt.parse(stream)
 56
 57    Output::
 58
 59        ['html',
 60            ['body',
 61                ['div#first.c1.c2',
 62                    "Banana"
 63                ],
 64                "Orange",
 65                ['br'],
 66                ['span#second',
 67                    {'attrs':
 68                        {'style': "color:green"}
 69                    },
 70                    "Watermelon"
 71                ]
 72            ]
 73        ]
 74
 75    """
 76    handler = _ContentHandler()
 77    parser = xml.sax.make_parser()
 78    parser.setContentHandler(handler)
 79    parser.setFeature(xml.sax.handler.feature_external_ges, False)
 80    parser.setFeature(xml.sax.handler.feature_external_pes, False)
 81    parser.parse(file)
 82    return handler.root
 83
 84
 85class _ContentHandler(xml.sax.ContentHandler):
 86
 87    def __init__(self):
 88        self._root = None
 89        self._stack = []
 90
 91    @property
 92    def root(self):
 93        return self._root
 94
 95    def startElement(self, name, attrs):
 96        attrs = {k.split(':')[-1]: v
 97                 for k, v in attrs.items()
 98                 if not k.startswith('xmlns:')}
 99        attrs = dict(attrs)
100        elm_id = attrs.pop('id', '')
101        elm_class = '.'.join(i for i in attrs.pop('class', '').split(' ') if i)
102        element = [name +
103                   (f'#{elm_id}' if elm_id else '') +
104                   (f'.{elm_class}' if elm_class else '')]
105        if attrs:
106            element.append({'attrs': attrs})
107        if self._stack:
108            self._stack[-1].append(element)
109        else:
110            self._root = element
111        self._stack.append(element)
112
113    def endElement(self, name):
114        self._stack.pop()
115
116    def characters(self, content):
117        self._stack[-1].append(content)
def parse(file: Union[os.PathLike, xml.sax.xmlreader.InputSource]) -> ~Data:
def parse(file: typing.Union[str,
                             os.PathLike,
                             typing.IO,
                             xml.sax.xmlreader.InputSource]
          ) -> Data:
    r"""Parse XML document into virtual tree

    `file` is handed unchanged to the SAX parser, so it may be a file name,
    a path-like object, an open file object (as in the example below) or an
    `xml.sax.xmlreader.InputSource`.

    Each element is recursively parsed into a list with the following
    structure, starting from the root of a document:

        * First item is a valid CSS selector string consisting of the
          element tag name, optionally followed by the `id` and `class`
          attributes if present.

        * If the element has attributes other than `id` or `class`, they are
          stored as a second item. The item is a dictionary which has an
          `attrs` key, whose value is another dictionary, with key-value pairs
          representing attribute names and their values, respectively.

        * All other items are element content. Each item is either a
          recursively parsed element child (a list), or text (a string).

    Resulting structure is JSON serializable.

    Namespace prefix declaration attributes (`xmlns:*`) are ignored and
    namespace prefixes are removed from the remaining attribute names.

    Example usage::

        import io
        from hat import json

        xml = '''\
            <html>
                <body>
                    <div id="first" class="c1 c2">
                        Banana
                    </div>
                    Orange
                    <br/>
                    <span id="second" style="color:green">
                        Watermelon
                    </span>
                </body>
            </html>
        '''
        stripped = ''.join(line.lstrip() for line in xml.split('\n'))
        stream = io.StringIO(stripped)
        parsed = json.vt.parse(stream)

    Output::

        ['html',
            ['body',
                ['div#first.c1.c2',
                    "Banana"
                ],
                "Orange",
                ['br'],
                ['span#second',
                    {'attrs':
                        {'style': "color:green"}
                    },
                    "Watermelon"
                ]
            ]
        ]

    """
    handler = _ContentHandler()
    parser = xml.sax.make_parser()
    parser.setContentHandler(handler)
    # never load external general/parameter entities referenced by the
    # document
    parser.setFeature(xml.sax.handler.feature_external_ges, False)
    parser.setFeature(xml.sax.handler.feature_external_pes, False)
    parser.parse(file)
    return handler.root

Parse XML document into virtual tree

Each element is recursively parsed into a list with the following structure, starting from the root of a document:

* First item is a valid CSS selector string consisting of the element tag
  name, optionally followed by the `id` and `class` attributes if present.

* If the element has attributes other than `id` or `class`, they are
  stored as a second item. The item is a dictionary which has an
  `attrs` key, whose value is another dictionary, with key-value pairs
  representing attribute names and their values, respectively.

* All other items are element content. Each item is either a
  recursively parsed element child (a list), or text (a string).

Resulting structure is JSON serializable.

Namespace prefix declaration attributes (xmlns:*) are ignored, and namespace prefixes are removed from the names of the remaining attributes.

Example usage::

import io
from hat import json

xml = '''\
    <html>
        <body>
            <div id="first" class="c1 c2">
                Banana
            </div>
            Orange
            <br/>
            <span id="second" style="color:green">
                Watermelon
            </span>
        </body>
    </html>
'''
stripped = ''.join(line.lstrip() for line in xml.split('\n'))
stream = io.StringIO(stripped)
parsed = json.vt.parse(stream)

Output::

['html',
    ['body',
        ['div#first.c1.c2',
            "Banana"
        ],
        "Orange",
        ['br'],
        ['span#second',
            {'attrs':
                {'style': "color:green"}
            },
            "Watermelon"
        ]
    ]
]