hat.json.vt

Virtual tree XML parser

  1"""Virtual tree XML parser"""
  2
  3import os
  4import xml.sax
  5import xml.sax.handler
  6import xml.sax.xmlreader
  7
  8from hat.json.data import Data
  9
 10
 11def parse(file: os.PathLike | xml.sax.xmlreader.InputSource
 12          ) -> Data:
 13    r"""Parse XML document into virtual tree
 14
 15    Each element is recursively parsed into a list with the following
 16    structure, starting from the root of a document:
 17
 18        * First item is a valid CSS selector string, consisting of element tag
 19          name; and optionally `id` and `class` attributes if present.
 20
 21        * If the element has attributes other than `id` or `class`, they are
 22          stored as a second item. The item is a dictionary which has an
 23          `attrs` key, whose value is another dictionary, with key-value pairs
 24          representing attribute names and their values, respectively.
 25
 26        * All other items are element content. Each item is either a
 27          recursively parsed element child (a list), or text (a string).
 28
 29    Resulting structure is JSON serializable.
 30
 31    Namespace prefix declaration attributes (`xmlns:*`) and prefixed elements
 32    are ignored.
 33
 34    Example usage::
 35
 36        import io
 37        from hat import json
 38
 39        xml = '''\
 40            <html>
 41                <body>
 42                    <div id="first" class="c1 c2">
 43                        Banana
 44                    </div>
 45                    Orange
 46                    <br/>
 47                    <span id="second" style="color:green">
 48                        Watermelon
 49                    </span>
 50                </body>
 51            </html>
 52        '''
 53        stripped = ''.join(line.lstrip() for line in xml.split('\n'))
 54        stream = io.StringIO(stripped)
 55        parsed = json.vt.parse(stream)
 56
 57    Output::
 58
 59        ['html',
 60            ['body',
 61                ['div#first.c1.c2',
 62                    "Banana"
 63                ],
 64                "Orange",
 65                ['br'],
 66                ['span#second',
 67                    {'attrs':
 68                        {'style': "color:green"}
 69                    },
 70                    "Watermelon"
 71                ]
 72            ]
 73        ]
 74
 75    """
 76    handler = _ContentHandler()
 77    parser = xml.sax.make_parser()
 78    parser.setContentHandler(handler)
 79    parser.setFeature(xml.sax.handler.feature_external_ges, False)
 80    parser.setFeature(xml.sax.handler.feature_external_pes, False)
 81    parser.parse(file)
 82    return handler.root
 83
 84
 85class _ContentHandler(xml.sax.ContentHandler):
 86
 87    def __init__(self):
 88        self._root = None
 89        self._stack = []
 90        self._skip_depth = 0
 91
 92    @property
 93    def root(self):
 94        return self._root
 95
 96    def startElement(self, name, attrs):
 97        if ':' in name or self._skip_depth:
 98            self._skip_depth += 1
 99            return
100
101        attrs = {k.split(':')[-1]: v
102                 for k, v in attrs.items()
103                 if not k.startswith('xmlns:')}
104        attrs = dict(attrs)
105        elm_id = attrs.pop('id', '')
106        elm_class = '.'.join(i for i in attrs.pop('class', '').split(' ') if i)
107        element = [name +
108                   (f'#{elm_id}' if elm_id else '') +
109                   (f'.{elm_class}' if elm_class else '')]
110        if attrs:
111            element.append({'attrs': attrs})
112        if self._stack:
113            self._stack[-1].append(element)
114        else:
115            self._root = element
116        self._stack.append(element)
117
118    def endElement(self, name):
119        if self._skip_depth:
120            self._skip_depth -= 1
121            return
122
123        self._stack.pop()
124
125    def characters(self, content):
126        if self._skip_depth:
127            return
128
129        self._stack[-1].append(content)
def parse( file: os.PathLike | xml.sax.xmlreader.InputSource) -> None | bool | int | float | str | List[None | bool | int | float | str | List[ForwardRef('Data')] | Dict[str, ForwardRef('Data')]] | Dict[str, None | bool | int | float | str | List[ForwardRef('Data')] | Dict[str, ForwardRef('Data')]]:
def parse(file: os.PathLike | xml.sax.xmlreader.InputSource
          ) -> Data:
    r"""Convert an XML document into its virtual tree representation

    Starting with the document root, every element is turned (recursively)
    into a list laid out as follows:

        * The first item is a CSS selector style string: the element tag
          name, followed by `#<id>` and `.<class>` parts when the element
          has `id` and/or `class` attributes.

        * When the element carries attributes other than `id` and `class`,
          the second item is a dictionary with a single `attrs` key. Its
          value is a dictionary mapping attribute names to attribute
          values.

        * Every remaining item is element content - either a child element
          (a list built by the same rules) or text (a string).

    The result can be serialized as JSON.

    Namespace prefix declaration attributes (`xmlns:*`) and prefixed elements
    are ignored.

    Example usage::

        import io
        from hat import json

        xml = '''\
            <html>
                <body>
                    <div id="first" class="c1 c2">
                        Banana
                    </div>
                    Orange
                    <br/>
                    <span id="second" style="color:green">
                        Watermelon
                    </span>
                </body>
            </html>
        '''
        stripped = ''.join(line.lstrip() for line in xml.split('\n'))
        stream = io.StringIO(stripped)
        parsed = json.vt.parse(stream)

    Output::

        ['html',
            ['body',
                ['div#first.c1.c2',
                    "Banana"
                ],
                "Orange",
                ['br'],
                ['span#second',
                    {'attrs':
                        {'style': "color:green"}
                    },
                    "Watermelon"
                ]
            ]
        ]

    """
    content_handler = _ContentHandler()

    reader = xml.sax.make_parser()
    reader.setContentHandler(content_handler)

    # inclusion of external general and parameter entities is switched off
    for feature in (xml.sax.handler.feature_external_ges,
                    xml.sax.handler.feature_external_pes):
        reader.setFeature(feature, False)

    reader.parse(file)
    return content_handler.root

Parse XML document into virtual tree

Each element is recursively parsed into a list with the following structure, starting from the root of a document:

* First item is a valid CSS selector string, consisting of element tag
  name; and optionally `id` and `class` attributes if present.

* If the element has attributes other than `id` or `class`, they are
  stored as a second item. The item is a dictionary which has an
  `attrs` key, whose value is another dictionary, with key-value pairs
  representing attribute names and their values, respectively.

* All other items are element content. Each item is either a
  recursively parsed element child (a list), or text (a string).

Resulting structure is JSON serializable.

Namespace prefix declaration attributes (xmlns:*) and prefixed elements are ignored.

Example usage::

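import io
from hat import json

xml = '''\
    <html>
        <body>
            <div id="first" class="c1 c2">
                Banana
            </div>
            Orange
            <br/>
            <span id="second" style="color:green">
                Watermelon
            </span>
        </body>
    </html>
'''
stripped = ''.join(line.lstrip() for line in xml.split('\n'))
stream = io.StringIO(stripped)
parsed = json.vt.parse(stream)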

Output::

['html',
    ['body',
        ['div#first.c1.c2',
            "Banana"
        ],
        "Orange",
        ['br'],
        ['span#second',
            {'attrs':
                {'style': "color:green"}
            },
            "Watermelon"
        ]
    ]
]