#! /usr/bin/env python """ this file converts simple html text into a docbook xml variant. The mapping of markups and links is far from perfect. But all we want is the docbook-to-pdf converter and similar technology being present in the world of docbook-to-anything converters. """ from datetime import date import match import sys m = match.Match class htm2dbk_conversion_base: regexlist = [ m()("(.*)", "m") >> "\n\\1", m()("<[hH]2>") >> "", m()("<[Pp]([> ])","m") >> "<para\\1", m()("</[Pp]>") >> "</para>", m()("<(pre|PRE)>") >> "<screen>", m()("</(pre|PRE)>") >> "</screen>", m()("<[hH]3>") >> "<sect2><title>", m()("</[hH]3>((?:.(?!<sect2>))*.?)", "s") >> "\\1", m()("]*>","s") >> "", m()("]*>","s") >> "", m()("(<\w+\b[^<>]*\swidth=)(\d+\%)","s") >> "\\1\"\\2\"", m()("(<\w+\b[^<>]*\s\w+=)(\d+)","s") >> "\\1\"\\2\"", m()("&&") >> "\&\;\&\;", m()("\$\<") >> "\$\<\;", m()("&(\w+[\),])") >> "\&\;\\1", m()("(]*)?>","s") >> "\\1phrase\\2>", m()("(]*)?>","s") >> "\\1note\\2>", m()("(")>> "\\1emphasis>", m()("(") >> "\\1listitem>", m()("(") >> "\\1itemizedlist>", m()("(") >> "\\1orderedlist>", m()("(") >> "\\1variablelist>", m()("]*)>","s") >> "", m()("]*)>","s") >> "", m()("]*)>","s") >> "", m()("]*)>","s") >> "", m()("]*)>","s") >> "", m()("]*)>","s") >> "", m()("(]*)?>","s") >> "\\1row\\2>", m()("(]*)?>","s") >> "\\1entry\\2>", m()("]*>\s*]*>\s*"+ "\s*]*>\s*]*>\s*> "\s*\s*"+ "\s*\s*\s*", "s") >> "", m()("(]*\swidth=\"100\%\")","s") >> "\\1 pgwide=\"1\"", m()("(\s*]*>\s*]*\s)(width=\"50\%\")","s") >> "\n\\1\\2", m()("([\'\`]*)") >> "\\1", m()("([\'\`]*)") >> "\\1", m()("<(?:tt|code)>([\`\"\'])") >> "\\1", m()("<(?:tt|code)>([\`\"\'])") >> "\\1", m()("([\`\"\'])") >> "\\1", m()("([\`\"\'])") >> "\\1", m()("(") >> "\\1constant>", m()("(") >> "\\1literal>", m()(">([^<>]+)
","s") >> ">\\1", m()("
") >> "
", # m()("") >> "", # m()("") >> "", m()("") >> "" >> 1, m()("]+)\"\s*>((?:.(?!))*.)" ,"s") >> "\\2", m()("((?:.(?!))*.)","s") >> "$2", m()("((?:.(?!))*.)","s") >> "$2", m()("((?:.(?!))*.)","s") >> "\\2", m()("((?:.(?!))*.)" ,"s") >> "\\2", m()("((?:.(?!))*.)","s") >> "\\2" # m()("(") >> "\\1para>" # $_ .= "
" if / ]/ ] regexlist2 = [ m()(r"") >> "", m()(r"(") >> r"\1emphasis>", m()(r"") >> "", m()(r"") >> "", m()(r"") >> "", m()(r"") >> "", m()(r"(?s)\s*") >> "", # m()(r"") >> "", m()(r"
    ") >> "", m()(r"
") >> "", # m()(r"
  • ") >> "", # m()(r"
  • ") >> "" m()(r"
  • ") >> "", m()(r"
  • ") >> "
    \n", ] class htm2dbk_conversion(htm2dbk_conversion_base): def __init__(self): self.version = "" # str(date.today) self.filename = "." def convert(self,text): # $text txt = text.replace("", self.version) for conv in self.regexlist: txt &= conv return txt.replace("--filename--", self.filename) def convert2(self,text): # $text txt = text.replace("", self.version) for conv in self.regexlist: txt &= conv return txt class htm2dbk_document(htm2dbk_conversion): """ create document, add(text) and get the value() """ doctype = ( ''+ "\n") book_start = 'Documentation'+"\n" book_end_chapters = ''+"\n" book_end = ''+"\n" def __init__(self): htm2dbk_conversion.__init__(self) self.text = self.doctype + self.book_start def add(self,text): if self.text & m()("","") & ( m()("([^<>]*)") >> "\\1") & ( m()("(?s)(\s*)" + "([^<>]*)") >> "\\1\\2") def value(self): return self.text + self.book_end_chapters + self.book_end def htm2dbk_files(args): doc = htm2dbk_document() for filename in args: try: f = open(filename, "r") doc.filename = filename doc.add(f.read()) f.close() except IOError, e: print >> sys.stderr, "can not open "+filename return doc.value() def html2docbook(text): """ the C comment may contain html markup - simulate with docbook tags """ return htm2dbk_conversion().convert2(text) if __name__ == "__main__": print htm2dbk_files(sys.argv[1:])