#!/usr/bin/env python

import sys
import re

# Lazily captures everything between a bare ``<h1>`` and the next ``</h1>``.
# DOTALL lets the captured heading span several lines.
TITLE_REGEXP = re.compile(r'<h1>(.*?)</h1>', flags=re.DOTALL | re.UNICODE)

# Matches one ``<...>`` run: an opening bracket up to the nearest ``>``.
TAG_REGEXP = re.compile(r'<[^>]*?>')

try:
    from html import escape as _escape  # Python 3.2+
except ImportError:
    from cgi import escape as _escape  # Python 2

try:
    _text_type = unicode  # Python 2
except NameError:
    _text_type = str  # Python 3


class Document(object):
    """Incrementally build an HTML document as one text string.

    Every ``emit_*`` method (and ``append``) adds markup to ``self.content``
    and returns ``self`` so calls can be chained.  Two opening sequences are
    available: ``emit_open()`` for a document without a head, or
    ``emit_open_head()`` ... ``emit_open_body()`` for one with a head.
    Either way the document is finished with ``emit_close()``.
    """

    # A name is spliced into markup as (part of) an attribute name or as a
    # ``<meta name>`` value, so the *whole* string is validated: it must
    # start with a word character and may continue with word characters,
    # '.', ':' or '-'.  ``\Z`` (not ``$``) also rejects a trailing newline.
    _NAME_REGEXP = re.compile(r'\w[\w.:-]*\Z')

    def __init__(self):
        # Accumulated document text.
        self.content = u''

    def _check_name(self, name):
        """Raise ``ValueError`` unless *name* is safe to embed in markup."""
        if not self._NAME_REGEXP.match(name):
            raise ValueError('invalid name: %r' % (name,))

    @staticmethod
    def _escape_attr(value):
        """Return *value* as text escaped for a double-quoted attribute."""
        return _escape(_text_type(value), True)

    def emit_open(self):
        """Open the document and its body in one step (no head section)."""
        self.content += '<html><body>\n'
        return self

    def emit_open_head(self):
        """Open the document and its head section."""
        self.content += '<html><head>\n'
        return self

    def emit_title(self, title):
        """Append a ``<title>`` element.

        *title* is inserted verbatim (it is not escaped), so it must already
        be valid markup text.
        """
        self.content += '<title>%s</title>\n' % title
        return self

    def emit_base(self, base):
        """Append a ``<base>`` element whose ``href`` is the escaped *base*."""
        self.content += '<base href="%s"/>\n' % self._escape_attr(base)
        return self

    def emit_open_body(self):
        """Close the head section and open the body."""
        self.content += '</head><body>\n'
        return self

    def emit_close(self):
        """Close the body and the document."""
        self.content += '</body></html>\n'
        return self

    def emit_break(self):
        """Append a line break element."""
        self.content += '<br/>\n'
        return self

    def emit_zone(self, name, value):
        """Append a complete zone: opening tag, *value* verbatim, closing tag.

        Raises ``ValueError`` (before anything is appended) when *name* is
        not a valid name.
        """
        self.emit_open_zone(name)
        self.content += value
        self.emit_close_zone()
        return self

    def emit_open_zone(self, name):
        """Open a zone: a ``<div>`` carrying the attribute ``yx:<name>="yes"``.

        Raises ``ValueError`` when *name* is not a valid name.
        """
        self._check_name(name)
        self.content += '<div yx:%s="yes">' % name
        return self

    def emit_close_zone(self):
        """Close the most recently opened zone ``<div>``."""
        self.content += '</div>\n'
        return self

    def emit_attr(self, name, value):
        """Append an empty ``<div>`` with attribute ``yx:<name>`` set to *value*.

        *value* is converted to text and escaped.  Raises ``ValueError`` when
        *name* is not a valid name.
        """
        self._check_name(name)
        self.content += '<div yx:%(name)s="%(value)s"/>\n' % dict(
            name=name, value=self._escape_attr(value))
        return self

    def emit_meta(self, name, value):
        """Append a ``<meta>`` element with the given *name* and content.

        *value* is converted to text and escaped.  Raises ``ValueError`` when
        *name* is not a valid name.
        """
        self._check_name(name)
        self.content += '<meta name="%(name)s" content="%(content)s"/>\n' % dict(
            name=name, content=self._escape_attr(value))
        return self

    def append(self, content):
        """Append *content* verbatim."""
        self.content += content
        return self

    def __str__(self):
        """Return the document as the native ``str`` type.

        On Python 2 that is the UTF-8 encoded byte string; on Python 3
        ``__str__`` must return text, so the content is returned as is.
        """
        if str is bytes:  # Python 2
            return self.content.encode('utf-8')
        return self.content

    def __unicode__(self):
        """Return the document as text (used by ``unicode()`` on Python 2)."""
        return self.content

def read_doc(fname):
    """Return the contents of the file *fname* decoded as UTF-8 text.

    Raises ``IOError``/``OSError`` when the file cannot be opened and
    ``UnicodeDecodeError`` when its bytes are not valid UTF-8.
    """
    # Binary mode: the bytes are decoded explicitly, so no text-mode
    # processing is wanted.  The ``with`` block closes the file even when
    # reading or decoding raises.
    with open(fname, 'rb') as f:
        return f.read().decode('utf-8')

def strip_tags(text):
    """Return *text* with every match of ``TAG_REGEXP`` removed."""
    without_tags = re.sub(TAG_REGEXP, '', text)
    return without_tags

def main(fname):
    """Wrap the document stored in *fname* in an HTML page and print it.

    The file is read with ``read_doc``.  When ``TITLE_REGEXP`` finds a
    heading, its tag-stripped text becomes the page title.  The file's text
    is then placed unchanged into the body and the finished page is written
    to standard output followed by a newline.
    """
    source = read_doc(fname)
    page = Document().emit_open_head()

    heading = TITLE_REGEXP.search(source)
    if heading is not None:
        page.emit_title(strip_tags(heading.group(1)))

    page.emit_open_body().append(source).emit_close()
    sys.stdout.write(str(page) + '\n')

if __name__ == '__main__':
    # The input file name is required.  Report a missing argument on stderr
    # with a non-zero exit status instead of an IndexError traceback.  Any
    # extra arguments are ignored.
    if len(sys.argv) < 2:
        sys.stderr.write('usage: %s FILE\n' % sys.argv[0])
        sys.exit(2)
    main(sys.argv[1])
