1 // Copyright (c) 2012-2020 Ugorji Nwoke. All rights reserved. 2 // Use of this source code is governed by a MIT license found in the LICENSE file. 3 4 //go:build ignore 5 // +build ignore 6 7 package codec 8 9 /* 10 11 A strict Non-validating namespace-aware XML 1.0 parser and (en|de)coder. 12 13 We are attempting this due to perceived issues with encoding/xml: 14 - Complicated. It tried to do too much, and is not as simple to use as json. 15 - Due to over-engineering, reflection is over-used AND performance suffers: 16 java is 6X faster:http://fabsk.eu/blog/category/informatique/dev/golang/ 17 even PYTHON performs better: http://outgoing.typepad.com/outgoing/2014/07/exploring-golang.html 18 19 codec framework will offer the following benefits 20 - VASTLY improved performance (when using reflection-mode or codecgen) 21 - simplicity and consistency: with the rest of the supported formats 22 - all other benefits of codec framework (streaming, codegeneration, etc) 23 24 codec is not a drop-in replacement for encoding/xml. 25 It is a replacement, based on the simplicity and performance of codec. 26 Look at it like JAXB for Go. 27 28 Challenges: 29 - Need to output XML preamble, with all namespaces at the right location in the output. 30 - Each "end" block is dynamic, so we need to maintain a context-aware stack 31 - How to decide when to use an attribute VS an element 32 - How to handle chardata, attr, comment EXPLICITLY. 33 - Should it output fragments? 34 e.g. encoding a bool should just output true OR false, which is not well-formed XML. 35 36 Extend the struct tag. See representative example: 37 type X struct { 38 ID uint8 `codec:"http://ugorji.net/x-namespace xid id,omitempty,toarray,attr,cdata"` 39 // format: [namespace-uri ][namespace-prefix ]local-name, ... 
40 } 41 42 Based on this, we encode 43 - fields as elements, BUT 44 encode as attributes if struct tag contains ",attr" and is a scalar (bool, number or string) 45 - text as entity-escaped text, BUT encode as CDATA if struct tag contains ",cdata". 46 47 To handle namespaces: 48 - XMLHandle is denoted as being namespace-aware. 49 Consequently, we WILL use the ns:name pair to encode and decode if defined, else use the plain name. 50 - *Encoder and *Decoder know whether the Handle "prefers" namespaces. 51 - add *Encoder.getEncName(*structFieldInfo). 52 No one calls *structFieldInfo.indexForEncName directly anymore 53 - OR better yet: indexForEncName is namespace-aware, and helper.go is all namespace-aware 54 indexForEncName takes a parameter of the form namespace:local-name OR local-name 55 - add *Decoder.getStructFieldInfo(encName string) // encName here is either like abc, or h1:nsabc 56 by being a method on *Decoder, or maybe a method on the Handle itself. 57 No one accesses .encName anymore 58 - let encode.go and decode.go use these (for consistency) 59 - only problem exists for gen.go, where we create a big switch on encName. 60 Now, we also have to add a switch on strings.endsWith(kName, encNsName) 61 - gen.go will need to have many more methods, and then double-on the 2 switch loops like: 62 switch k { 63 case "abc" : x.abc() 64 case "def" : x.def() 65 default { 66 switch { 67 case !nsAware: panic(...) 68 case strings.endsWith(":abc"): x.abc() 69 case strings.endsWith(":def"): x.def() 70 default: panic(...) 71 } 72 } 73 } 74 75 The structure below accommodates this: 76 77 type typeInfo struct { 78 sfi []*structFieldInfo // sorted by encName 79 sfins // sorted by namespace 80 sfia // sorted, to have those with attributes at the top. Needed to write XML appropriately. 
81 sfip // unsorted 82 } 83 type structFieldInfo struct { 84 encName 85 nsEncName 86 ns string 87 attr bool 88 cdata bool 89 } 90 91 indexForEncName is now an internal helper function that takes a sorted array 92 (one of ti.sfins or ti.sfi). It is only used by *Encoder.getStructFieldInfo(...) 93 94 There will be a separate parser from the builder. 95 The parser will have a method: next() xmlToken method. It has lookahead support, 96 so you can pop multiple tokens, make a determination, and push them back in the order popped. 97 This will be needed to determine whether we are "nakedly" decoding a container or not. 98 The stack will be implemented using a slice and push/pop happens at the [0] element. 99 100 xmlToken has fields: 101 - type uint8: 0 | ElementStart | ElementEnd | AttrKey | AttrVal | Text 102 - value string 103 - ns string 104 105 SEE: http://www.xml.com/pub/a/98/10/guide0.html?page=3#ENTDECL 106 107 The following are skipped when parsing: 108 - External Entities (from external file) 109 - Notation Declaration e.g. <!NOTATION GIF87A SYSTEM "GIF"> 110 - Entity Declarations & References 111 - XML Declaration (assume UTF-8) 112 - XML Directive i.e. <! ... > 113 - Other Declarations: Notation, etc. 114 - Comment 115 - Processing Instruction 116 - schema / DTD for validation: 117 We are not a VALIDATING parser. Validation is done elsewhere. 118 However, some parts of the DTD internal subset are used (SEE BELOW). 119 For Attribute List Declarations e.g. 120 <!ATTLIST foo:oldjoke name ID #REQUIRED label CDATA #IMPLIED status ( funny | notfunny ) 'funny' > 121 We considered using the ATTLIST to get "default" value, but not to validate the contents. (VETOED) 122 123 The following XML features are supported 124 - Namespace 125 - Element 126 - Attribute 127 - cdata 128 - Unicode escape 129 130 The following DTD (when as an internal sub-set) features are supported: 131 - Internal Entities e.g. 
    <!ENTITY burns "ugorji is cool" > AND entities for the set: [<>&"']
  - Parameter entities e.g.
    <!ENTITY % personcontent "ugorji is cool"> <!ELEMENT burns (%personcontent;)*>

At decode time, a structure containing the following is kept
  - namespace mapping
  - default attribute values
  - all internal entities (<>&"' and others written in the document)

When decode starts, it parses XML namespace declarations and creates a map in the
xmlDecDriver. While parsing, that map continuously gets updated.
The only problem happens when a namespace declaration happens on the node that it defines.
e.g. <hn:name xmlns:hn="http://www.ugorji.net" >
To handle this, each Element must be fully parsed at a time,
even if it amounts to multiple tokens which are returned one at a time on request.

xmlns is a special attribute name.
  - It is used to define namespaces, including the default
  - It is never returned as an AttrKey or AttrVal.
  *We may decide later to allow user to use it e.g. you want to parse the xmlns mappings into a field.*

Number, bool, null, mapKey, etc can all be decoded from any xmlToken.
This accommodates map[int]string for example.

It should be possible to create a schema from the types,
or vice versa (generate types from schema with appropriate tags).
This is however out of scope for this parsing project.

We should write all namespace information at the first point that it is referenced in the tree,
and use the mapping for all child nodes and attributes. This means that state is maintained
at a point in the tree. This also means that calls to Decode or MustDecode will reset some state.

When decoding, it is important to keep track of entity references and default attribute values.
It seems these can only be stored in the DTD components. We should honor them when decoding.
166 167 Configuration for XMLHandle will look like this: 168 169 XMLHandle 170 DefaultNS string 171 // Encoding: 172 NS map[string]string // ns URI to key, used for encoding 173 // Decoding: in case ENTITY declared in external schema or dtd, store info needed here 174 Entities map[string]string // map of entity rep to character 175 176 177 During encode, if a namespace mapping is not defined for a namespace found on a struct, 178 then we create a mapping for it using nsN (where N is 1..1000000, and doesn't conflict 179 with any other namespace mapping). 180 181 Note that different fields in a struct can have different namespaces. 182 However, all fields will default to the namespace on the _struct field (if defined). 183 184 An XML document is a name, a map of attributes and a list of children. 185 Consequently, we cannot "DecodeNaked" into a map[string]interface{} (for example). 186 We have to "DecodeNaked" into something that resembles XML data. 187 188 To support DecodeNaked (decode into nil interface{}), we have to define some "supporting" types: 189 type Name struct { // Preferred. Less allocations due to conversions. 190 Local string 191 Space string 192 } 193 type Element struct { 194 Name Name 195 Attrs map[Name]string 196 Children []interface{} // each child is either *Element or string 197 } 198 Only two "supporting" types are exposed for XML: Name and Element. 199 200 // ------------------ 201 202 We considered 'type Name string' where Name is like "Space Local" (space-separated). 203 We decided against it, because each creation of a name would lead to 204 double allocation (first convert []byte to string, then concatenate them into a string). 205 The benefit is that it is faster to read Attrs from a map. But given that Element is a value 206 object, we want to eschew methods and have public exposed variables. 
207 208 We also considered the following, where xml types were not value objects, and we used 209 intelligent accessor methods to extract information and for performance. 210 *** WE DECIDED AGAINST THIS. *** 211 type Attr struct { 212 Name Name 213 Value string 214 } 215 // Element is a ValueObject: There are no accessor methods. 216 // Make element self-contained. 217 type Element struct { 218 Name Name 219 attrsMap map[string]string // where key is "Space Local" 220 attrs []Attr 221 childrenT []string 222 childrenE []Element 223 childrenI []int // each child is a index into T or E. 224 } 225 func (x *Element) child(i) interface{} // returns string or *Element 226 227 // ------------------ 228 229 Per XML spec and our default handling, white space is always treated as 230 insignificant between elements, except in a text node. The xml:space='preserve' 231 attribute is ignored. 232 233 **Note: there is no xml: namespace. The xml: attributes were defined before namespaces.** 234 **So treat them as just "directives" that should be interpreted to mean something**. 235 236 On encoding, we support indenting aka prettifying markup in the same way we support it for json. 237 238 A document or element can only be encoded/decoded from/to a struct. In this mode: 239 - struct name maps to element name (or tag-info from _struct field) 240 - fields are mapped to child elements or attributes 241 242 A map is either encoded as attributes on current element, or as a set of child elements. 243 Maps are encoded as attributes iff their keys and values are primitives (number, bool, string). 244 245 A list is encoded as a set of child elements. 246 247 Primitives (number, bool, string) are encoded as an element, attribute or text 248 depending on the context. 249 250 Extensions must encode themselves as a text string. 251 252 Encoding is tough, specifically when encoding mappings, because we need to encode 253 as either attribute or element. 
To do this, we need to default to encoding as attributes,
and then let Encoder inform the Handle when to start encoding as nodes.
i.e. Encoder does something like:

    h.EncodeMapStart()
    h.Encode(), h.Encode(), ...
    h.EncodeMapNotAttrSignal() // this is not a bool, because it's a signal
    h.Encode(), h.Encode(), ...
    h.EncodeEnd()

Only XMLHandle understands this, and will set itself to start encoding as elements.

This support extends to maps. For example, if a struct field is a map, and it has
the struct tag signifying it should be attr, then all its entries are encoded as attributes.
e.g.

    type X struct {
       M map[string]int `codec:"m,attr"` // encode keys as attributes named
    }

Question:
  - if encoding a map, what if map keys have spaces in them???
    Then they cannot be attributes or child elements. Error.

Options to consider adding later:
  - For attribute values, normalize by trimming beginning and ending white space,
    and converting every white space sequence to a single space.
  - ATTLIST restrictions are enforced.
    e.g. default value of xml:space, skipping xml:XYZ style attributes, etc.
  - Consider supporting NON-STRICT mode (e.g. to handle HTML parsing).
    Some elements e.g. br, hr, etc need not close and should be auto-closed
    ... (see http://www.w3.org/TR/html4/loose.dtd)
    An expansive set of entities is pre-defined.
  - Have easy way to create a HTML parser:
    add a HTML() method to XMLHandle, that will set Strict=false, specify AutoClose,
    and add HTML Entities to the list.
  - Support validating element/attribute XMLName before writing it.
    Keep this behind a flag, which is set to false by default (for performance).
    type XMLHandle struct {
      CheckName bool
    }

Misc:

ROADMAP (1 week):
  - build encoder (1 day)
  - build decoder (based off xmlParser) (1 day)
  - implement xmlParser (2 days).
    Look at encoding/xml for inspiration.
  - integrate and TEST (1 day)
  - write article and post it (1 day)

// ---------- MORE NOTES FROM 2017-11-30 ------------

when parsing
- parse the attributes first
- then parse the nodes

basically:
- if encoding a field: we use the field name for the wrapper
- if encoding a non-field, then just use the element type name

  map[string]string ==> <map><key>abc</key><value>val</value></map>... or
                        <map key="abc">val</map>... OR
                        <key1>val1</key1><key2>val2</key2>...       <- PREFERRED
  []string  ==> <string>v1</string><string>v2</string>...
  string v1 ==> <string>v1</string>
  bool true ==> <bool>true</bool>
  float 1.0 ==> <float>1.0</float>
  ...

  F1 map[string]string ==> <F1><key>abc</key><value>val</value></F1>... OR
                           <F1 key="abc">val</F1>... OR
                           <F1><abc>val</abc>...</F1>                <- PREFERRED
  F2 []string          ==> <F2>v1</F2><F2>v2</F2>...
  F3 bool              ==> <F3>true</F3>
  ...

- a scalar is encoded as:
  (value) of type T  ==> <T><value/></T>
  (value) of field F ==> <F><value/></F>
- A kv-pair is encoded as:
  (key,value) ==> <map><key><value/></key></map> OR <map key="value">
  (key,value) of field F ==> <F><key><value/></key></F> OR <F key="value">
- A map or struct is just a list of kv-pairs
- A list is encoded as sequences of same node e.g.
  <F1 key1="value11">
  <F1 key2="value12">
  <F2>value21</F2>
  <F2>value22</F2>
- we may have to singularize the field name, when entering into xml,
  and pluralize it when decoding.
- bi-directional encode->decode->encode is not a MUST.
346 even encoding/xml cannot decode correctly what was encoded: 347 348 see https://play.golang.org/p/224V_nyhMS 349 func main() { 350 fmt.Println("Hello, playground") 351 v := []interface{}{"hello", 1, true, nil, time.Now()} 352 s, err := xml.Marshal(v) 353 fmt.Printf("err: %v, \ns: %s\n", err, s) 354 var v2 []interface{} 355 err = xml.Unmarshal(s, &v2) 356 fmt.Printf("err: %v, \nv2: %v\n", err, v2) 357 type T struct { 358 V []interface{} 359 } 360 v3 := T{V: v} 361 s, err = xml.Marshal(v3) 362 fmt.Printf("err: %v, \ns: %s\n", err, s) 363 var v4 T 364 err = xml.Unmarshal(s, &v4) 365 fmt.Printf("err: %v, \nv4: %v\n", err, v4) 366 } 367 Output: 368 err: <nil>, 369 s: <string>hello</string><int>1</int><bool>true</bool><Time>2009-11-10T23:00:00Z</Time> 370 err: <nil>, 371 v2: [<nil>] 372 err: <nil>, 373 s: <T><V>hello</V><V>1</V><V>true</V><V>2009-11-10T23:00:00Z</V></T> 374 err: <nil>, 375 v4: {[<nil> <nil> <nil> <nil>]} 376 - 377 */ 378 379 // ----------- PARSER ------------------- 380 381 type xmlTokenType uint8 382 383 const ( 384 _ xmlTokenType = iota << 1 385 xmlTokenElemStart 386 xmlTokenElemEnd 387 xmlTokenAttrKey 388 xmlTokenAttrVal 389 xmlTokenText 390 ) 391 392 type xmlToken struct { 393 Type xmlTokenType 394 Value string 395 Namespace string // blank for AttrVal and Text 396 } 397 398 type xmlParser struct { 399 r decReader 400 toks []xmlToken // list of tokens. 401 ptr int // ptr into the toks slice 402 done bool // nothing else to parse. r now returns EOF. 403 } 404 405 func (x *xmlParser) next() (t *xmlToken) { 406 // once x.done, or x.ptr == len(x.toks) == 0, then return nil (to signify finish) 407 if !x.done && len(x.toks) == 0 { 408 x.nextTag() 409 } 410 // parses one element at a time (into possible many tokens) 411 if x.ptr < len(x.toks) { 412 t = &(x.toks[x.ptr]) 413 x.ptr++ 414 if x.ptr == len(x.toks) { 415 x.ptr = 0 416 x.toks = x.toks[:0] 417 } 418 } 419 return 420 } 421 422 // nextTag will parses the next element and fill up toks. 
423 // It set done flag if/once EOF is reached. 424 func (x *xmlParser) nextTag() { 425 // ... 426 } 427 428 // ----------- ENCODER ------------------- 429 430 type xmlEncDriver struct { 431 e *Encoder 432 w encWriter 433 h *XMLHandle 434 b [64]byte // scratch 435 bs []byte // scratch 436 // s jsonStack 437 noBuiltInTypes 438 } 439 440 // ----------- DECODER ------------------- 441 442 type xmlDecDriver struct { 443 d *Decoder 444 h *XMLHandle 445 r decReader // *bytesDecReader decReader 446 ct valueType // container type. one of unset, array or map. 447 bstr [8]byte // scratch used for string \UXXX parsing 448 b [64]byte // scratch 449 450 // wsSkipped bool // whitespace skipped 451 452 // s jsonStack 453 454 noBuiltInTypes 455 } 456 457 // DecodeNaked will decode into an XMLNode 458 459 // XMLName is a value object representing a namespace-aware NAME 460 type XMLName struct { 461 Local string 462 Space string 463 } 464 465 // XMLNode represents a "union" of the different types of XML Nodes. 466 // Only one of fields (Text or *Element) is set. 467 type XMLNode struct { 468 Element *Element 469 Text string 470 } 471 472 // XMLElement is a value object representing an fully-parsed XML element. 473 type XMLElement struct { 474 Name Name 475 Attrs map[XMLName]string 476 // Children is a list of child nodes, each being a *XMLElement or string 477 Children []XMLNode 478 } 479 480 // ----------- HANDLE ------------------- 481 482 type XMLHandle struct { 483 BasicHandle 484 textEncodingType 485 486 DefaultNS string 487 NS map[string]string // ns URI to key, for encoding 488 Entities map[string]string // entity representation to string, for encoding. 
489 } 490 491 func (h *XMLHandle) newEncDriver(e *Encoder) encDriver { 492 return &xmlEncDriver{e: e, w: e.w, h: h} 493 } 494 495 func (h *XMLHandle) newDecDriver(d *Decoder) decDriver { 496 // d := xmlDecDriver{r: r.(*bytesDecReader), h: h} 497 hd := xmlDecDriver{d: d, r: d.r, h: h} 498 hd.n.bytes = d.b[:] 499 return &hd 500 } 501 502 var _ decDriver = (*xmlDecDriver)(nil) 503 var _ encDriver = (*xmlEncDriver)(nil) 504