- Old HTML contains tags that have no end tag: <br> for example.
- HTML tags are case insensitive, XML tags are case sensitive. Since this library can parse both, only the client knows which comparison is to be used.
- Thus the checks would have been very difficult to implement properly with little benefit, especially since they are simple to implement in the client. The client should use the errorMsgExpected proc to generate a nice error message that fits the other error messages this library creates.
Example 1: Retrieve HTML title
The file examples/htmltitle.nim demonstrates how to use the XML parser to accomplish a simple task: To determine the title of an HTML document.
# Example program to show the parsexml module
# This program reads an HTML file and writes its title to stdout.
# Errors and whitespace are ignored.

import os, streams, parsexml, strutils

if paramCount() < 1:
  quit("Usage: htmltitle filename[.html]")

# The ".html" extension is optional on the command line.
var filename = addFileExt(paramStr(1), "html")
var s = newFileStream(filename, fmRead)
if s == nil: quit("cannot open the file " & filename)
var x: TXmlParser
open(x, s, filename)
while true:
  x.next()
  case x.kind
  of xmlElementStart:
    # HTML tag names are case insensitive, hence the case-ignoring comparison.
    if cmpIgnoreCase(x.elementName, "title") == 0:
      var title = ""
      x.next()  # skip "<title>"
      # Collect every consecutive character data event into the title.
      while x.kind == xmlCharData:
        title.add(x.charData)
        x.next()
      if x.kind == xmlElementEnd and cmpIgnoreCase(x.elementName, "title") == 0:
        echo("Title: " & title)
        x.close() # close the parser and its input stream before exiting
        quit(0) # Success!
      else:
        # The parser does not check for matching end tags itself, so the
        # client reports the problem in the library's own message format.
        echo(x.errorMsgExpected("/title"))
  of xmlEof: break # end of file reached
  else: discard # ignore other events

x.close()
quit("Could not determine title!")
Example 2: Retrieve all HTML links
The file examples/htmlrefs.nim demonstrates how to use the XML parser to accomplish another simple task: To determine all the links an HTML document contains.
# Example program to show the new parsexml module
# This program reads an HTML file and writes all its used links to stdout.
# Errors and whitespace are ignored.

import os, streams, parsexml, strutils

proc `=?=` (a, b: string): bool =
  # little trick: define our own comparator that ignores case
  return cmpIgnoreCase(a, b) == 0

if paramCount() < 1:
  quit("Usage: htmlrefs filename[.html]")

var links = 0 # count the number of links
# The ".html" extension is optional on the command line.
var filename = addFileExt(paramStr(1), "html")
var s = newFileStream(filename, fmRead)
if s == nil: quit("cannot open the file " & filename)
var x: TXmlParser
open(x, s, filename)
next(x) # get first event
# The labelled block lets the inner skip loop leave the whole scan at once
# when the file ends in the middle of a tag.
block mainLoop:
  while true:
    case x.kind
    of xmlElementOpen:
      # the <a href = "xyz"> tag we are interested in always has an attribute,
      # thus we search for ``xmlElementOpen`` and not for ``xmlElementStart``
      if x.elementName =?= "a":
        x.next()
        if x.kind == xmlAttribute:
          if x.attrKey =?= "href":
            var link = x.attrValue
            inc(links)
            # skip until we have an ``xmlElementClose`` event
            while true:
              x.next()
              case x.kind
              of xmlEof: break mainLoop
              of xmlElementClose: break
              else: discard
            x.next() # skip ``xmlElementClose``
            # now we have the description for the ``a`` element
            var desc = ""
            while x.kind == xmlCharData:
              desc.add(x.charData)
              x.next()
            echo(desc & ": " & link)
      else:
        x.next()
    of xmlEof: break # end of file reached
    of xmlError:
      echo(errorMsg(x))
      x.next()
    else: x.next() # skip other events

echo($links & " link(s) found!")
x.close()
Types
TXmlEventKind = enum xmlError, ## an error occurred during parsing xmlEof, ## end of file reached xmlCharData, ## character data xmlWhitespace, ## whitespace has been parsed xmlComment, ## a comment has been parsed xmlPI, ## processing instruction (``<?name something ?>``) xmlElementStart, ## ``<elem>`` xmlElementEnd, ## ``</elem>`` xmlElementOpen, ## ``<elem`` xmlAttribute, ## ``key = "value"`` pair xmlElementClose, ## ``>`` xmlCData, ## ``<![CDATA[`` ... data ... ``]]>`` xmlEntity, ## &entity; xmlSpecial ## ``<! ... data ... >``
- enumeration that lists all events that may occur when parsing
TXmlError = enum errNone, ## no error errEndOfCDataExpected, ## ``]]>`` expected errNameExpected, ## name expected errSemicolonExpected, ## ``;`` expected errQmGtExpected, ## ``?>`` expected errGtExpected, ## ``>`` expected errEqExpected, ## ``=`` expected errQuoteExpected, ## ``"`` or ``'`` expected errEndOfCommentExpected ## ``-->`` expected
- enumeration that lists all errors that can occur
TXmlParseOption = enum reportWhitespace, ## report whitespace reportComments ## report comments
- options for the XML parser
TXmlParser = object of TBaseLexer a, b, c: string kind: TXmlEventKind err: TXmlError state: TParserState filename: string options: set[TXmlParseOption]
- the parser object.
Procs
proc open(my: var TXmlParser; input: PStream; filename: string; options: set[TXmlParseOption] = {}) {.raises: [E_Base], tags: [FReadIO].}
- initializes the parser with an input stream. Filename is only used for nice error messages. The parser's behaviour can be controlled by the options parameter: If options contains reportWhitespace a whitespace token is reported as an xmlWhitespace event. If options contains reportComments a comment token is reported as an xmlComment event.
proc close(my: var TXmlParser) {.inline, raises: [E_Base], tags: [].}
- closes the parser my and its associated input stream.
proc kind(my: TXmlParser): TXmlEventKind {.inline, raises: [], tags: [].}
- returns the current event type for the XML parser
proc charData(my: TXmlParser): string {.inline, raises: [], tags: [].}
- returns the character data for the events: xmlCharData, xmlWhitespace, xmlComment, xmlCData, xmlSpecial
proc elementName(my: TXmlParser): string {.inline, raises: [], tags: [].}
- returns the element name for the events: xmlElementStart, xmlElementEnd, xmlElementOpen
proc entityName(my: TXmlParser): string {.inline, raises: [], tags: [].}
- returns the entity name for the event: xmlEntity
proc attrKey(my: TXmlParser): string {.inline, raises: [], tags: [].}
- returns the attribute key for the event xmlAttribute
proc attrValue(my: TXmlParser): string {.inline, raises: [], tags: [].}
- returns the attribute value for the event xmlAttribute
proc PIName(my: TXmlParser): string {.inline, raises: [], tags: [].}
- returns the processing instruction name for the event xmlPI
proc PIRest(my: TXmlParser): string {.inline, raises: [], tags: [].}
- returns the rest of the processing instruction for the event xmlPI
proc rawData(my: TXmlParser): string {.inline, raises: [], tags: [].}
- returns the underlying 'data' string by reference. This is only used for speed hacks.
proc rawData2(my: TXmlParser): string {.inline, raises: [], tags: [].}
- returns the underlying second 'data' string by reference. This is only used for speed hacks.
proc getColumn(my: TXmlParser): int {.inline, raises: [], tags: [].}
- get the current column the parser has arrived at.
proc getLine(my: TXmlParser): int {.inline, raises: [], tags: [].}
- get the current line the parser has arrived at.
proc getFilename(my: TXmlParser): string {.inline, raises: [], tags: [].}
- get the filename of the file that the parser processes.
proc errorMsg(my: TXmlParser): string {.raises: [EInvalidValue], tags: [].}
- returns a helpful error message for the event xmlError
proc errorMsgExpected(my: TXmlParser; tag: string): string {. raises: [EInvalidValue], tags: [].}
- returns an error message "<tag> expected" in the same format as the other error messages
proc errorMsg(my: TXmlParser; msg: string): string {.raises: [EInvalidValue], tags: [].}
- returns an error message with text msg in the same format as the other error messages
proc next(my: var TXmlParser) {.raises: [E_Base], tags: [FReadIO].}
- retrieves the first/next event. This controls the parser.