#! /usr/bin/env python

# This code is licensed under the GPL.
# Get yourself a version here : http://www.gnu.org/copyleft/gpl.html
#
# This script takes an HTML file containing definition lists marked up
# with dl, dt and dd tags and prints them in the format expected by
# dictfmt -p
#
# Extracted from the dictfmt man page:
#
#      -p     FILE  is  formatted  with `%h' in column 0, followed by a blank,
#             followed by the headword , followed by a line containing `%d' in
#             column  0.   The  definition starts on the following line.  This
#             option was written to format Jay Kominek's elements database.
#
# You may want to use this script as follows:
#
# ./html2dictfmt-p.py source.html | dictfmt -p -s 'dict desc' mydict

import sys, string, locale
from HTMLParser import HTMLParser

# Prefix that MyParser.handle_starttag prepends to link URLs which do not
# start with 'http'.
BASE_URL = "http://mafate.sis.pasteur.fr/olive"

def print_def(word, data):
    """Print one dictionary entry in the layout read by ``dictfmt -p``.

    Each entry is written to standard output as a ``%h <headword>`` line,
    a ``%d`` line, the definition indented by four spaces, and a blank
    line.

    word -- headword text; every ' :' sequence is removed first.  When the
            result contains commas it is treated as a list of synonyms and
            one entry is printed per non-empty item, all sharing the same
            definition.
    data -- definition text; surrounding whitespace is stripped.
    """
    word = word.replace(' :', '')

    if ',' in word:
        # Split on the bare comma and trim each item.  Splitting on ', '
        # while testing for ',' would leave a headword such as "a,b"
        # unchanged and recurse forever.
        headwords = [part.strip() for part in word.split(',')]
        # A leading/trailing comma would otherwise yield an empty headword.
        headwords = [headword for headword in headwords if headword]
    else:
        headwords = [word]

    definition = data.strip()
    for headword in headwords:
        # Single-argument print(...) behaves the same whether print is a
        # statement or a function.
        print("%%h %s" % headword)
        print("%d")
        print("    %s" % definition)
        print("")

class MyParser(HTMLParser):
    """Convert the <dl> definition lists of an HTML page to dictfmt entries.

    Text found directly inside <dt> becomes the headword; text found
    directly inside <dd> or <pre>, link targets of <a> tags, self-closing
    tags and entity/character references are accumulated as the
    definition.  An entry is handed to print_def() when the next headword
    starts, and the last one when close() is called.

    Only text whose most recently opened tag is dt, dd or pre is kept:
    any end tag resets the current tag, so text following a nested
    element is ignored.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # True while the parser is between <dl> and </dl>.
        self.in_dico = False
        # Headword collected so far; None until the first <dt> text is seen.
        self.current_def = None
        # Name of the most recently opened tag; None after any end tag.
        self.current_tag = None
        # Definition text accumulated for current_def.
        self.accu = ""

    def handle_starttag(self, tag, attrs):
        """Track the open tag and append the target of links to the definition."""
        self.current_tag = tag
        if tag == 'dl':
            self.in_dico = True

        if self.in_dico and tag == 'a':
            url = None
            for (attr, val) in attrs:
                # The link target lives in the 'href' attribute.
                if attr == 'href':
                    url = val
                    break

            # No href (e.g. a named anchor) or a valueless/empty one:
            # there is no target to print, and val may be None.
            if not url:
                return

            if url[:4] != 'http':
                # Relative url, we have to add the BASE_URL info
                self.accu = self.accu + ' ' + BASE_URL + url
            else:
                self.accu = self.accu + ' ' + url

    def handle_endtag(self, tag):
        """Forget the current tag; leaving </dl> ends the dictionary section."""
        self.current_tag = None
        if tag == 'dl':
            self.in_dico = False

    def handle_data(self, data):
        """Route text to the headword (<dt>) or to the definition (<dd>/<pre>)."""
        if not self.in_dico:
            return

        if self.current_tag == 'dt':
            if self.accu == "":
                # No definition collected yet: still building the headword.
                if self.current_def is None:
                    self.current_def = data
                else:
                    self.current_def = self.current_def + data
            else:
                # A new headword starts: emit the finished entry, if any.
                if self.current_def is not None:
                    print_def(self.current_def, self.accu)
                # Always start afresh, even when the accumulated text had
                # no headword; otherwise this and every later headword
                # would be dropped.
                self.accu = ""
                self.current_def = data

        elif self.current_tag in ('dd', 'pre'):
            self.accu = self.accu + data.strip()

    def handle_startendtag(self, tag, attrs):
        """Copy self-closing tags (e.g. <br/>) verbatim into the definition."""
        if self.in_dico:
            self.accu = self.accu + self.get_starttag_text()

    def handle_entityref(self, name):
        """Re-emit a named entity reference such as &amp; unchanged."""
        if self.in_dico:
            self.accu = self.accu + "&%s;" % name

    def handle_charref(self, name):
        """Re-emit a numeric character reference such as &#233; unchanged."""
        if self.in_dico:
            self.accu = self.accu + "&#%s;" % name

    def close(self):
        """Finish parsing and print the last pending entry, if there is one."""
        HTMLParser.close(self)
        if self.current_def is not None:
            print_def(self.current_def, self.accu)

if __name__ == '__main__':
    # The only argument is the path of the HTML file to convert.
    if len(sys.argv) == 1:
        # Standard output is meant to be piped into dictfmt, so the
        # complaint goes to stderr and the exit status signals failure.
        sys.stderr.write("no file to parse\n")
        sys.exit(1)

    filename = sys.argv[1]

    # Read the whole document, making sure the file is closed afterwards.
    source = open(filename)
    try:
        html = source.read()
    finally:
        source.close()

    p = MyParser()
    p.feed(html)
    # close() also prints the last pending dictionary entry.
    p.close()
