Module parsexml

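This module implements a simple high performance XML / HTML parser. The only encoding that is supported is UTF-8. The parser has been designed to be somewhat error correcting, so that even most "wild HTML" found on the web can be parsed with it. Note: This parser does not check that each <tag> has a corresponding </tag>! These checks have to be implemented by the client code, for various reasons:

- Old HTML contains tags that have no end tag: <br> for example.
- HTML tags are case insensitive, whereas XML tags are case sensitive. Since this parser can handle both, only the client knows which comparison is to be used.

The client can use the errorMsgExpected proc to generate an error message that fits the other error messages this module creates.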

Example 1: Retrieve HTML title

The file examples/htmltitle.nim demonstrates how to use the XML parser to accomplish a simple task: To determine the title of an HTML document.

# Example program to show the parsexml module
# This program reads an HTML file and writes its title to stdout.
# Errors and whitespace are ignored.

import os, streams, parsexml, strutils

# A file name argument is mandatory; the ".html" extension may be omitted.
if paramCount() < 1:
  quit("Usage: htmltitle filename[.html]")

let path = addFileExt(paramStr(1), "html")
let input = newFileStream(path, fmRead)
if input == nil:
  quit("cannot open the file " & path)

var parser: TXmlParser
open(parser, input, path)
while true:
  parser.next()
  if parser.kind == xmlEof:
    break # end of file reached
  # Only an attribute-less "<title>" start tag is of interest; every other
  # event is ignored.
  if parser.kind != xmlElementStart or
      cmpIgnoreCase(parser.elementName, "title") != 0:
    continue

  parser.next()  # skip "<title>"
  # The title text is the run of character data events that follows.
  var text = ""
  while parser.kind == xmlCharData:
    text.add(parser.charData)
    parser.next()

  let closed = parser.kind == xmlElementEnd and
      cmpIgnoreCase(parser.elementName, "title") == 0
  if not closed:
    # Report the missing end tag and keep scanning the document.
    echo(parser.errorMsgExpected("/title"))
    continue
  echo("Title: " & text)
  quit(0) # Success!

# Reached only when the whole file was read without finding a title.
parser.close()
quit("Could not determine title!")

Example 2: Retrieve all HTML links

The file examples/htmlrefs.nim demonstrates how to use the XML parser to accomplish another simple task: To determine all the links an HTML document contains.

# Example program to show the new parsexml module
# This program reads an HTML file and writes all its used links to stdout.
# Errors and whitespace are ignored.

import os, streams, parsexml, strutils

proc `=?=` (a, b: string): bool =
  # Custom equality operator: true when ``a`` and ``b`` are the same string
  # once letter case is disregarded.
  result = a.cmpIgnoreCase(b) == 0

# A file name argument is mandatory; the ".html" extension may be omitted.
if paramCount() < 1:
  quit("Usage: htmlrefs filename[.html]")

var links = 0 # count the number of links
# Standard library routines are spelled exactly as declared (``paramStr``,
# ``echo``): the first character of a Nim identifier is case sensitive.
let filename = addFileExt(paramStr(1), "html")
let s = newFileStream(filename, fmRead)
if s == nil: quit("cannot open the file " & filename)
var x: TXmlParser
open(x, s, filename)
next(x) # get first event
# At the top of the loop ``x`` always holds an event that has not been
# handled yet, so every branch is responsible for advancing the parser.
block mainLoop:
  while true:
    case x.kind
    of xmlElementOpen:
      # the <a href = "xyz"> tag we are interested in always has an attribute,
      # thus we search for ``xmlElementOpen`` and not for ``xmlElementStart``
      if x.elementName =?= "a":
        x.next()
        # NOTE: only the first attribute of the tag is inspected; a link
        # whose ``href`` is not the first attribute is not reported.
        if x.kind == xmlAttribute:
          if x.attrKey =?= "href":
            let link = x.attrValue
            inc(links)
            # skip until we have an ``xmlElementClose`` event
            while true:
              x.next()
              case x.kind
              of xmlEof: break mainLoop # file ended inside the tag
              of xmlElementClose: break # leaves only the inner loop
              else: discard
            x.next() # skip ``xmlElementClose``
            # now we have the description for the ``a`` element
            var desc = ""
            while x.kind == xmlCharData:
              desc.add(x.charData)
              x.next()
            echo(desc & ": " & link)
      else:
        x.next()
    of xmlEof: break # end of file reached
    of xmlError:
      # Report the problem and carry on with the next event.
      echo(errorMsg(x))
      x.next()
    else: x.next() # skip other events

echo($links & " link(s) found!")
x.close()

Types

TXmlEventKind = enum 
  xmlError,                   ## an error occurred during parsing
  xmlEof,                     ## end of file reached
  xmlCharData,                ## character data
  xmlWhitespace,              ## whitespace has been parsed
  xmlComment,                 ## a comment has been parsed
  xmlPI,                      ## processing instruction (``<?name something ?>``)
  xmlElementStart,            ## ``<elem>``
  xmlElementEnd,              ## ``</elem>``
  xmlElementOpen,             ## ``<elem ``
  xmlAttribute,               ## ``key = "value"`` pair
  xmlElementClose,            ## ``>`` 
  xmlCData,                   ## ``<![CDATA[`` ... data ... ``]]>``
  xmlEntity,                  ## &entity;
  xmlSpecial                  ## ``<! ... data ... >``
enumeration of all events that may occur when parsing
TXmlError = enum 
  errNone,                    ## no error
  errEndOfCDataExpected,      ## ``]]>`` expected
  errNameExpected,            ## name expected
  errSemicolonExpected,       ## ``;`` expected
  errQmGtExpected,            ## ``?>`` expected
  errGtExpected,              ## ``>`` expected
  errEqExpected,              ## ``=`` expected
  errQuoteExpected,           ## ``"`` or ``'`` expected
  errEndOfCommentExpected     ## ``-->`` expected
enumeration that lists all errors that can occur
TXmlParseOption = enum 
  reportWhitespace,           ## report whitespace
  reportComments              ## report comments
options for the XML parser
TXmlParser = object of TBaseLexer
  a, b, c: string
  kind: TXmlEventKind
  err: TXmlError
  state: TParserState
  filename: string
  options: set[TXmlParseOption]
the parser object.

Procs

proc open(my: var TXmlParser; input: PStream; filename: string; 
          options: set[TXmlParseOption] = {}) {.raises: [E_Base], 
    tags: [FReadIO].}
initializes the parser with an input stream. Filename is only used for nice error messages. The parser's behaviour can be controlled by the options parameter: If options contains reportWhitespace a whitespace token is reported as an xmlWhitespace event. If options contains reportComments a comment token is reported as an xmlComment event.
proc close(my: var TXmlParser) {.inline, raises: [E_Base], tags: [].}
closes the parser my and its associated input stream.
proc kind(my: TXmlParser): TXmlEventKind {.inline, raises: [], tags: [].}
returns the current event type for the XML parser
proc charData(my: TXmlParser): string {.inline, raises: [], tags: [].}
returns the character data for the events: xmlCharData, xmlWhitespace, xmlComment, xmlCData, xmlSpecial
proc elementName(my: TXmlParser): string {.inline, raises: [], tags: [].}
returns the element name for the events: xmlElementStart, xmlElementEnd, xmlElementOpen
proc entityName(my: TXmlParser): string {.inline, raises: [], tags: [].}
returns the entity name for the event: xmlEntity
proc attrKey(my: TXmlParser): string {.inline, raises: [], tags: [].}
returns the attribute key for the event xmlAttribute
proc attrValue(my: TXmlParser): string {.inline, raises: [], tags: [].}
returns the attribute value for the event xmlAttribute
proc PIName(my: TXmlParser): string {.inline, raises: [], tags: [].}
returns the processing instruction name for the event xmlPI
proc PIRest(my: TXmlParser): string {.inline, raises: [], tags: [].}
returns the rest of the processing instruction for the event xmlPI
proc rawData(my: TXmlParser): string {.inline, raises: [], tags: [].}
returns the underlying 'data' string by reference. This is only used for speed hacks.
proc rawData2(my: TXmlParser): string {.inline, raises: [], tags: [].}
returns the underlying second 'data' string by reference. This is only used for speed hacks.
proc getColumn(my: TXmlParser): int {.inline, raises: [], tags: [].}
get the current column the parser has arrived at.
proc getLine(my: TXmlParser): int {.inline, raises: [], tags: [].}
get the current line the parser has arrived at.
proc getFilename(my: TXmlParser): string {.inline, raises: [], tags: [].}
get the filename of the file that the parser processes.
proc errorMsg(my: TXmlParser): string {.raises: [EInvalidValue], tags: [].}
returns a helpful error message for the event xmlError
proc errorMsgExpected(my: TXmlParser; tag: string): string {.
    raises: [EInvalidValue], tags: [].}
returns an error message "<tag> expected" in the same format as the other error messages
proc errorMsg(my: TXmlParser; msg: string): string {.raises: [EInvalidValue], 
    tags: [].}
returns an error message with text msg in the same format as the other error messages
proc next(my: var TXmlParser) {.raises: [E_Base], tags: [FReadIO].}
retrieves the first/next event. This controls the parser.
Generated: 2014-03-11 21:26:41 UTC