hat.json.vt

Virtual tree XML parser

View Source

  1"""Virtual tree XML parser"""
  2
  3import os
  4import xml.sax
  5import xml.sax.handler
  6import xml.sax.xmlreader
  7
  8from hat.json.data import Data
  9
 10
 11def parse(file: os.PathLike | xml.sax.xmlreader.InputSource
 12          ) -> Data:
 13    r"""Parse XML document into virtual tree
 14
 15    Each element is recursively parsed into a list with the following
 16    structure, starting from the root of a document:
 17
 18        * First item is a valid CSS selector string, consisting of element tag
 19          name; and optionally `id` and `class` attributes if present.
 20
 21        * If the element has attributes other than `id` or `class`, they are
 22          stored as a second item. The item is a dictionary which has an
 23          `attrs` key, whose value is another dictionary, with key-value pairs
 24          representing attribute names and their values, respectively.
 25
 26        * All other items are element content. Each item is either a
 27          recursively parsed element child (a list), or text (a string).
 28
 29    Resulting structure is JSON serializable.
 30
 31    Namespace prefix declaration attributes (`xmlns:*`) are ignored.
 32
 33    Example usage::
 34
 35        import io
 36        from hat import json
 37
 38        xml = '''\
 39            <html>
 40                <body>
 41                    <div id="first" class="c1 c2">
 42                        Banana
 43                    </div>
 44                    Orange
 45                    <br/>
 46                    <span id="second" style="color:green">
 47                        Watermelon
 48                    </span>
 49                </body>
 50            </html>
 51        '''
 52        stripped = ''.join(line.lstrip() for line in xml.split('\n'))
 53        stream = io.StringIO(stripped)
 54        parsed = json.vt.parse(stream)
 55
 56    Output::
 57
 58        ['html',
 59            ['body',
 60                ['div#first.c1.c2',
 61                    "Banana"
 62                ],
 63                "Orange",
 64                ['br'],
 65                ['span#second',
 66                    {'attrs':
 67                        {'style': "color:green"}
 68                    },
 69                    "Watermelon"
 70                ]
 71            ]
 72        ]
 73
 74    """
 75    handler = _ContentHandler()
 76    parser = xml.sax.make_parser()
 77    parser.setContentHandler(handler)
 78    parser.setFeature(xml.sax.handler.feature_external_ges, False)
 79    parser.setFeature(xml.sax.handler.feature_external_pes, False)
 80    parser.parse(file)
 81    return handler.root
 82
 83
 84class _ContentHandler(xml.sax.ContentHandler):
 85
 86    def __init__(self):
 87        self._root = None
 88        self._stack = []
 89
 90    @property
 91    def root(self):
 92        return self._root
 93
 94    def startElement(self, name, attrs):
 95        attrs = {k.split(':')[-1]: v
 96                 for k, v in attrs.items()
 97                 if not k.startswith('xmlns:')}
 98        attrs = dict(attrs)
 99        elm_id = attrs.pop('id', '')
100        elm_class = '.'.join(i for i in attrs.pop('class', '').split(' ') if i)
101        element = [name +
102                   (f'#{elm_id}' if elm_id else '') +
103                   (f'.{elm_class}' if elm_class else '')]
104        if attrs:
105            element.append({'attrs': attrs})
106        if self._stack:
107            self._stack[-1].append(element)
108        else:
109            self._root = element
110        self._stack.append(element)
111
112    def endElement(self, name):
113        self._stack.pop()
114
115    def characters(self, content):
116        self._stack[-1].append(content)

def parse( file: os.PathLike | xml.sax.xmlreader.InputSource) -> Union[NoneType, bool, int, float, str, List[Union[NoneType, bool, int, float, str, List[ForwardRef('Data')], Dict[str, ForwardRef('Data')]]], Dict[str, Union[NoneType, bool, int, float, str, List[ForwardRef('Data')], Dict[str, ForwardRef('Data')]]]]: View Source

def parse(file: os.PathLike | xml.sax.xmlreader.InputSource
          ) -> Data:
    r"""Convert an XML document into a virtual tree

    The document is read with a SAX parser and every element, beginning
    with the document root, is turned into a list laid out as follows:

        * The first item is a CSS selector string: the element tag name,
          followed by `#<id>` when the element has an `id` attribute and by
          `.<class>` for each name listed in its `class` attribute.

        * When the element carries attributes other than `id` and `class`,
          the second item is a dictionary with a single `attrs` key. Its
          value is a dictionary mapping attribute names to attribute
          values.

        * Every remaining item is element content: a nested list for a
          child element, or a string for text.

    The returned structure is JSON serializable.

    Namespace prefix declaration attributes (`xmlns:*`) are dropped.

    External general and parameter entities are disabled on the parser
    before the document is read.

    Example usage::

        import io
        from hat import json

        xml = '''\
            <html>
                <body>
                    <div id="first" class="c1 c2">
                        Banana
                    </div>
                    Orange
                    <br/>
                    <span id="second" style="color:green">
                        Watermelon
                    </span>
                </body>
            </html>
        '''
        stripped = ''.join(line.lstrip() for line in xml.split('\n'))
        stream = io.StringIO(stripped)
        parsed = json.vt.parse(stream)

    Output::

        ['html',
            ['body',
                ['div#first.c1.c2',
                    "Banana"
                ],
                "Orange",
                ['br'],
                ['span#second',
                    {'attrs':
                        {'style': "color:green"}
                    },
                    "Watermelon"
                ]
            ]
        ]

    """
    reader = xml.sax.make_parser()

    # Switch off both kinds of external entity before any input is read.
    for feature in (xml.sax.handler.feature_external_ges,
                    xml.sax.handler.feature_external_pes):
        reader.setFeature(feature, False)

    tree_builder = _ContentHandler()
    reader.setContentHandler(tree_builder)
    reader.parse(file)
    return tree_builder.root

Parse XML document into virtual tree

Each element is recursively parsed into a list with the following structure, starting from the root of a document:

* First item is a valid CSS selector string, consisting of element tag
  name; and optionally `id` and `class` attributes if present.

* If the element has attributes other than `id` or `class`, they are
  stored as a second item. The item is a dictionary which has an
  `attrs` key, whose value is another dictionary, with key-value pairs
  representing attribute names and their values, respectively.

* All other items are element content. Each item is either a
  recursively parsed element child (a list), or text (a string).

Resulting structure is JSON serializable.

Namespace prefix declaration attributes (xmlns:*) are ignored.

Example usage::

import io
from hat import json

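xml = '''\
    <html>
        <body>
            <div id="first" class="c1 c2">
                Banana
            </div>
            Orange
            <br/>
            <span id="second" style="color:green">
                Watermelon
            </span>
        </body>
    </html>
'''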
stripped = ''.join(line.lstrip() for line in xml.split('\n'))
stream = io.StringIO(stripped)
parsed = json.vt.parse(stream)

Output::

['html',
    ['body',
        ['div#first.c1.c2',
            "Banana"
        ],
        "Orange",
        ['br'],
        ['span#second',
            {'attrs':
                {'style': "color:green"}
            },
            "Watermelon"
        ]
    ]
]