hat.json.vt
Virtual tree XML parser
"""Virtual tree XML parser"""

import os
import xml.sax
import xml.sax.handler
import xml.sax.xmlreader

from hat.json.data import Data


def parse(file: os.PathLike | xml.sax.xmlreader.InputSource) -> Data:
    r"""Parse XML document into virtual tree

    `file` is handed unchanged to the SAX reader's `parse` method. Loading
    of external general and external parameter entities is switched off.

    Starting from the root of a document, each element is recursively
    turned into a list with the following structure:

    * First item is a valid CSS selector string, consisting of element tag
      name; and optionally `id` and `class` attributes if present.

    * If the element has attributes other than `id` or `class`, they are
      stored as a second item. The item is a dictionary which has an
      `attrs` key, whose value is another dictionary, with key-value pairs
      representing attribute names and their values, respectively.

    * All other items are element content. Each item is either a
      recursively parsed element child (a list), or text (a string).

    Resulting structure is JSON serializable.

    Namespace prefix declaration attributes (`xmlns:*`) and prefixed elements
    (together with everything nested inside them) are ignored. Prefixes of
    the remaining attribute names are dropped.

    Example usage::

        import io
        from hat import json

        xml = '''\
            <html>
                <body>
                    <div id="first" class="c1 c2">
                        Banana
                    </div>
                    Orange
                    <br/>
                    <span id="second" style="color:green">
                        Watermelon
                    </span>
                </body>
            </html>
        '''
        stripped = ''.join(line.lstrip() for line in xml.split('\n'))
        stream = io.StringIO(stripped)
        parsed = json.vt.parse(stream)

    Output::

        ['html',
            ['body',
                ['div#first.c1.c2',
                    "Banana"
                ],
                "Orange",
                ['br'],
                ['span#second',
                    {'attrs':
                        {'style': "color:green"}
                    },
                    "Watermelon"
                ]
            ]
        ]

    """
    reader = xml.sax.make_parser()
    # Both kinds of external entities stay disabled so that parsing never
    # pulls in resources referenced from the document.
    for feature in (xml.sax.handler.feature_external_ges,
                    xml.sax.handler.feature_external_pes):
        reader.setFeature(feature, False)

    tree_builder = _ContentHandler()
    reader.setContentHandler(tree_builder)
    reader.parse(file)
    return tree_builder.root


class _ContentHandler(xml.sax.ContentHandler):
    """SAX content handler which assembles the virtual tree

    Character data reported between two consecutive element events is
    collected and stored as a single string item. SAX readers are allowed
    to report one run of text in several chunks, so storing every chunk
    separately would make the resulting tree depend on reader internals.
    """

    def __init__(self):
        super().__init__()
        self._root = None
        # Elements which are currently open, outermost first.
        self._stack = []
        # Text chunks received since the last element start/end event.
        self._text = []
        # Nesting depth inside an ignored (prefixed) element; 0 if none.
        self._skip_depth = 0

    @property
    def root(self):
        """Root element of the parsed tree (``None`` until it is started)"""
        return self._root

    def startElement(self, name, attrs):
        """Open a new element and attach it to its parent

        Prefixed elements, and all elements nested inside them, are only
        counted so that the matching end events can be recognised.
        """
        # Flush first, even when this element ends up skipped, so that
        # text before and after a skipped element stays two separate items.
        self._flush_text()

        if ':' in name or self._skip_depth:
            self._skip_depth += 1
            return

        # Drop namespace prefix declarations and strip prefixes from the
        # remaining attribute names.
        attrs = {k.split(':')[-1]: v
                 for k, v in attrs.items()
                 if not k.startswith('xmlns:')}
        elm_id = attrs.pop('id', '')
        elm_class = '.'.join(i for i in attrs.pop('class', '').split(' ') if i)
        element = [name +
                   (f'#{elm_id}' if elm_id else '') +
                   (f'.{elm_class}' if elm_class else '')]
        if attrs:
            element.append({'attrs': attrs})
        if self._stack:
            self._stack[-1].append(element)
        else:
            self._root = element
        self._stack.append(element)

    def endElement(self, name):
        """Close the current element, or leave one level of a skipped one"""
        if self._skip_depth:
            self._skip_depth -= 1
            return

        self._flush_text()
        self._stack.pop()

    def characters(self, content):
        """Collect a chunk of text belonging to the current element"""
        if self._skip_depth:
            return

        self._text.append(content)

    def _flush_text(self):
        """Store collected text chunks as one item of the current element"""
        if not self._text:
            return

        self._stack[-1].append(''.join(self._text))
        self._text = []
def
parse( file: os.PathLike | xml.sax.xmlreader.InputSource) -> None | bool | int | float | str | List[None | bool | int | float | str | List[ForwardRef('Data')] | Dict[str, ForwardRef('Data')]] | Dict[str, None | bool | int | float | str | List[ForwardRef('Data')] | Dict[str, ForwardRef('Data')]]:
def parse(file: os.PathLike | xml.sax.xmlreader.InputSource) -> Data:
    r"""Parse XML document into virtual tree

    `file` is handed unchanged to the SAX reader's `parse` method. Loading
    of external general and external parameter entities is switched off.

    Starting from the root of a document, each element is recursively
    turned into a list with the following structure:

    * First item is a valid CSS selector string, consisting of element tag
      name; and optionally `id` and `class` attributes if present.

    * If the element has attributes other than `id` or `class`, they are
      stored as a second item. The item is a dictionary which has an
      `attrs` key, whose value is another dictionary, with key-value pairs
      representing attribute names and their values, respectively.

    * All other items are element content. Each item is either a
      recursively parsed element child (a list), or text (a string).

    Resulting structure is JSON serializable.

    Namespace prefix declaration attributes (`xmlns:*`) and prefixed elements
    are ignored.

    Example usage::

        import io
        from hat import json

        xml = '''\
            <html>
                <body>
                    <div id="first" class="c1 c2">
                        Banana
                    </div>
                    Orange
                    <br/>
                    <span id="second" style="color:green">
                        Watermelon
                    </span>
                </body>
            </html>
        '''
        stripped = ''.join(line.lstrip() for line in xml.split('\n'))
        stream = io.StringIO(stripped)
        parsed = json.vt.parse(stream)

    Output::

        ['html',
            ['body',
                ['div#first.c1.c2',
                    "Banana"
                ],
                "Orange",
                ['br'],
                ['span#second',
                    {'attrs':
                        {'style': "color:green"}
                    },
                    "Watermelon"
                ]
            ]
        ]

    """
    reader = xml.sax.make_parser()
    # Both kinds of external entities stay disabled so that parsing never
    # pulls in resources referenced from the document.
    for feature in (xml.sax.handler.feature_external_ges,
                    xml.sax.handler.feature_external_pes):
        reader.setFeature(feature, False)

    tree_builder = _ContentHandler()
    reader.setContentHandler(tree_builder)
    reader.parse(file)
    return tree_builder.root
Parse XML document into virtual tree
Each element is recursively parsed into a list with the following structure, starting from the root of a document:
* First item is a valid CSS selector string, consisting of element tag
name; and optionally `id` and `class` attributes if present.
* If the element has attributes other than `id` or `class`, they are
stored as a second item. The item is a dictionary which has an
`attrs` key, whose value is another dictionary, with key-value pairs
representing attribute names and their values, respectively.
* All other items are element content. Each item is either a
recursively parsed element child (a list), or text (a string).
Resulting structure is JSON serializable.
Namespace prefix declaration attributes (xmlns:*) and prefixed elements
are ignored.
Example usage::
import io
from hat import json
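xml = '''\
<html>
    <body>
        <div id="first" class="c1 c2">
            Banana
        </div>
        Orange
        <br/>
        <span id="second" style="color:green">
            Watermelon
        </span>
    </body>
</html>
'''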
stripped = ''.join(line.lstrip() for line in xml.split('\n'))
stream = io.StringIO(stripped)
parsed = json.vt.parse(stream)
Output::
['html',
['body',
['div#first.c1.c2',
"Banana"
],
"Orange",
['br'],
['span#second',
{'attrs':
{'style': "color:green"}
},
"Watermelon"
]
]
]