#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

from __future__ import with_statement

import csv
import sys
import os
import getopt
from struct import pack
from struct import unpack

# ``xrange`` only exists on Python 2.  Alias it so the index loops below run
# unchanged on Python 3 while staying lazy (no list allocation) on Python 2.
try:
    xrange
except NameError:
    xrange = range


class DocParser(object):
    """Parser over a flattened, line-oriented ``name=value`` document.

    The document text is split on newlines; each entry is either a bare tag
    path or ``tagpath=value``.  The remaining attributes are created empty
    here and are read by the paragraph-building code.
    """

    def __init__(self, flatxml, fileid):
        """Split *flatxml* into lines and derive the id from *fileid*.

        flatxml -- the whole flattened document as one newline-separated string
        fileid  -- path of the source file; its basename with every '.dat'
                   removed becomes ``self.id``
        """
        self.id = os.path.basename(fileid).replace('.dat','')
        self.flatdoc = flatxml.split('\n')
        self.ocrtext = []
        self.link_id = []
        self.link_title = []
        self.link_page = []
        self.dehyphen_rootid = []
        self.paracont_stemid = []
        self.parastems_stemid = []

    def findinDoc(self, tagpath, pos, end):
        """Find the first entry in ``[pos, end)`` whose name ends with *tagpath*.

        tagpath -- suffix to match against the part of the entry before '='
        pos     -- first index to examine
        end     -- index to stop before (exclusive); -1 means "to the end of
                   the document".  Values past the end are clamped.

        Returns ``(index, value)`` for the first match, where *value* is the
        text after the first '=' ('' when the entry has no '=').  Returns
        ``(-1, None)`` when nothing matches.
        """
        docList = self.flatdoc
        cnt = len(docList)
        if end == -1:
            end = cnt
        else:
            end = min(cnt, end)
        for j in xrange(pos, end):
            item = docList[j]
            if '=' in item:
                # Split on the first '=' only: the value may itself contain
                # '=' and must not break the two-way unpacking.
                name, argres = item.split('=', 1)
            else:
                name, argres = item, ''
            if name.endswith(tagpath):
                return j, argres
        return -1, None

    def posinDoc(self, tagpath):
        """Return the indices of every entry whose name ends with *tagpath*."""
        startpos = []
        pos = 0
        while True:
            foundpos, res = self.findinDoc(tagpath, pos, -1)
            if res is None:
                break
            startpos.append(foundpos)
            # resume the search just after the match we recorded
            pos = foundpos + 1
        return startpos

    def getParaDescription(self, start, end):
        """Describe the paragraph stored in entries ``[start, end)``.

        Returns ``(pclass, first, last)``: the cleaned-up class name and the
        first/last word values converted to int.

        Raises AttributeError when the region has no 'paragraph.class' entry
        and TypeError when it has no 'firstWord' entry at all (both were
        already the behaviour of the original code).
        """
        (pos, pclass) = self.findinDoc('paragraph.class', start, end)

        # Class names are a problem: topaz starts them with numerals (not
        # allowed), mixes cases (which causes some browsers problems) and
        # attaches numbers after "reclustered*" to deal with reflow issues.
        # Clean this up by lowercasing, prepending 'cl_' and cutting off
        # everything that follows 'reclustered'.
        pclass = 'cl_' + pclass.lower()
        p = pclass.find('reclustered')
        if p > 0:
            pclass = pclass[0:p + len('reclustered')]

        # normal paragraph: a single firstWord/lastWord pair
        (pos, sfirst) = self.findinDoc('paragraph.firstWord', start, end)
        (pos, slast) = self.findinDoc('paragraph.lastWord', start, end)
        if (sfirst is not None) and (slast is not None):
            return pclass, int(sfirst), int(slast)

        # Some paragraphs are instead split into multiple spans and some even
        # have word_semantic tags as well, so walk through this region keeping
        # track of the first firstWord and the last lastWord on any items that
        # have it.
        (pos, sfirst) = self.findinDoc('firstWord', start, end)
        first = int(sfirst)
        last = -1
        for i in xrange(pos + 1, end):
            # one-entry window: only entry i is examined
            (pos, slast) = self.findinDoc('lastWord', i, i + 1)
            if slast is not None:
                last = int(slast)
        return pclass, first, last

    # NOTE(review): the opening of buildParagraph shared the collapsed source
    # line with the methods above.  In this copy of the file it is cut off in
    # the middle of a string literal, and its remaining statements follow on
    # the next lines with their indentation lost, so it cannot be kept as
    # runnable code here.  The fragment is preserved token-for-token below;
    # recover the full method from the upstream source.
    #
    #   def buildParagraph(self, cname, first, last, type, regtype) :
    #       parares = ''
    #       sep =''
    #       br_lb = False
    #       if (regtype == 'fixed') or (regtype == 'chapterheading') :
    #           br_lb = True
    #       handle_links = False
    #       if len(self.link_id) > 0:
    #           handle_links = True
    #       if (type == 'full') or (type == 'begin') :
    #           parares += '
'
if (type == 'end'):
parares += ' '
for j in xrange(first, last) :
word = self.ocrtext[j]
sep = ' '
if handle_links:
link = self.link_id[j]
if (link > 0):
title = self.link_title[link-1]
if title == "": title='_link_'
ptarget = self.link_page[link-1] - 1
linkhtml = '' % ptarget
linkhtml += title + ''
pos = parares.rfind(title)
if pos >= 0:
parares = parares[0:pos] + linkhtml + parares[pos+len(title):]
else :
parares += linkhtml
if word == '_link_' : word = ''
elif (link < 0) :
if word == '_link_' : word = ''
if word == '_lb_':
if (j-1) in self.dehyphen_rootid :
word = ''
sep = ''
elif handle_links :
word = ''
sep = ''
elif br_lb :
word = '
\n'
sep = ''
else :
word = '\n'
sep = ''
if j in self.dehyphen_rootid :
word = word[0:-1]
sep = ''
parares += word + sep
if len(sep) > 0 : parares = parares[0:-1]
if (type == 'full') or (type == 'end') :
parares += '