hat.json.vt
Virtual tree XML parser
"""Virtual tree XML parser"""

import os
import xml.sax
import xml.sax.handler
import xml.sax.xmlreader

from hat.json.data import Data


def parse(file: os.PathLike | xml.sax.xmlreader.InputSource
          ) -> Data:
    r"""Parse XML document into virtual tree

    Each element is recursively parsed into a list with the following
    structure, starting from the root of a document:

        * First item is a valid CSS selector string, consisting of element tag
          name; and optionally `id` and `class` attributes if present.

        * If the element has attributes other than `id` or `class`, they are
          stored as a second item. The item is a dictionary which has an
          `attrs` key, whose value is another dictionary, with key-value pairs
          representing attribute names and their values, respectively.

        * All other items are element content. Each item is either a
          recursively parsed element child (a list), or text (a string).
          Contiguous character data is stored as a single string, even if
          the underlying SAX parser reports it in several chunks.

    Resulting structure is JSON serializable.

    Namespace prefix declaration attributes (`xmlns:*`) are ignored.

    External general and parameter entities are not loaded.

    Example usage::

        import io
        from hat import json

        xml = '''\
            <html>
                <body>
                    <div id="first" class="c1 c2">
                        Banana
                    </div>
                    Orange
                    <br/>
                    <span id="second" style="color:green">
                        Watermelon
                    </span>
                </body>
            </html>
        '''
        stripped = ''.join(line.lstrip() for line in xml.split('\n'))
        stream = io.StringIO(stripped)
        parsed = json.vt.parse(stream)

    Output::

        ['html',
            ['body',
                ['div#first.c1.c2',
                    "Banana"
                ],
                "Orange",
                ['br'],
                ['span#second',
                    {'attrs':
                        {'style': "color:green"}
                    },
                    "Watermelon"
                ]
            ]
        ]

    """
    handler = _ContentHandler()
    parser = xml.sax.make_parser()
    parser.setContentHandler(handler)
    # do not fetch external entities referenced by the document
    parser.setFeature(xml.sax.handler.feature_external_ges, False)
    parser.setFeature(xml.sax.handler.feature_external_pes, False)
    parser.parse(file)
    return handler.root


class _ContentHandler(xml.sax.ContentHandler):
    """SAX content handler which incrementally builds the virtual tree"""

    def __init__(self):
        super().__init__()
        self._root = None
        # currently open elements, innermost last
        self._stack = []
        # text chunks received since the last element boundary
        self._text = []

    @property
    def root(self):
        """Root element of the parsed document (``None`` before parsing)"""
        return self._root

    def startElement(self, name, attrs):
        # text preceding this element belongs to the parent element
        self._flush_text()
        # drop namespace declarations and strip namespace prefixes
        attrs = {k.split(':')[-1]: v
                 for k, v in attrs.items()
                 if not k.startswith('xmlns:')}
        elm_id = attrs.pop('id', '')
        elm_class = '.'.join(i for i in attrs.pop('class', '').split(' ') if i)
        element = [name +
                   (f'#{elm_id}' if elm_id else '') +
                   (f'.{elm_class}' if elm_class else '')]
        if attrs:
            element.append({'attrs': attrs})
        if self._stack:
            self._stack[-1].append(element)
        else:
            self._root = element
        self._stack.append(element)

    def endElement(self, name):
        # trailing text belongs to the element which is being closed
        self._flush_text()
        self._stack.pop()

    def characters(self, content):
        # a single run of text may be delivered in multiple calls - collect
        # chunks and emit them as one string at the next element boundary
        self._text.append(content)

    def _flush_text(self):
        """Append buffered text as a single item of the innermost element"""
        if not self._text:
            return
        self._stack[-1].append(''.join(self._text))
        self._text = []
def
parse( file: os.PathLike | xml.sax.xmlreader.InputSource) -> Union[NoneType, bool, int, float, str, List[Union[NoneType, bool, int, float, str, List[ForwardRef('Data')], Dict[str, ForwardRef('Data')]]], Dict[str, Union[NoneType, bool, int, float, str, List[ForwardRef('Data')], Dict[str, ForwardRef('Data')]]]]:
def parse(file: os.PathLike | xml.sax.xmlreader.InputSource
          ) -> Data:
    r"""Parse XML document into virtual tree

    The document is read with a SAX parser and every element, beginning
    with the document root, is converted into a list:

        * The first list item is a CSS selector string built from the
          element tag name, followed by the `id` and `class` attributes
          when they are present.

        * When the element carries attributes other than `id` or `class`,
          the second list item is a dictionary with a single `attrs` key.
          Its value is a dictionary mapping attribute names to attribute
          values.

        * Every remaining list item is element content - either a child
          element (a list built by the same rules) or text (a string).

    The returned structure is JSON serializable.

    Namespace prefix declaration attributes (`xmlns:*`) are ignored.

    External general and parameter entities are not loaded.

    Example usage::

        import io
        from hat import json

        xml = '''\
            <html>
                <body>
                    <div id="first" class="c1 c2">
                        Banana
                    </div>
                    Orange
                    <br/>
                    <span id="second" style="color:green">
                        Watermelon
                    </span>
                </body>
            </html>
        '''
        stripped = ''.join(line.lstrip() for line in xml.split('\n'))
        stream = io.StringIO(stripped)
        parsed = json.vt.parse(stream)

    Output::

        ['html',
            ['body',
                ['div#first.c1.c2',
                    "Banana"
                ],
                "Orange",
                ['br'],
                ['span#second',
                    {'attrs':
                        {'style': "color:green"}
                    },
                    "Watermelon"
                ]
            ]
        ]

    """
    content_handler = _ContentHandler()
    reader = xml.sax.make_parser()
    # switch off loading of external general and parameter entities
    for feature in (xml.sax.handler.feature_external_ges,
                    xml.sax.handler.feature_external_pes):
        reader.setFeature(feature, False)
    reader.setContentHandler(content_handler)
    reader.parse(file)
    return content_handler.root
Parse XML document into virtual tree
Each element is recursively parsed into a list with the following structure, starting from the root of a document:
* First item is a valid CSS selector string, consisting of element tag
name; and optionally `id` and `class` attributes if present.
* If the element has attributes other than `id` or `class`, they are
stored as a second item. The item is a dictionary which has an
`attrs` key, whose value is another dictionary, with key-value pairs
representing attribute names and their values, respectively.
* All other items are element content. Each item is either a
recursively parsed element child (a list), or text (a string).
Resulting structure is JSON serializable.
Namespace prefix declaration attributes (xmlns:*
) are ignored.
Example usage::
import io
from hat import json
xml = '''\
Banana
Orange
Watermelon
'''
stripped = ''.join(line.lstrip() for line in xml.split('\n'))
stream = io.StringIO(stripped)
parsed = json.vt.parse(stream)
Output::
['html',
['body',
['div#first.c1.c2',
"Banana"
],
"Orange",
['br'],
['span#second',
{'attrs':
{'style': "color:green"}
},
"Watermelon"
]
]
]