lg2txt.py

################################################################
# lg2txt.py
#
# Translate a label graph to a text file of different formats.
#
# NOTE: this program assumes that horizontal adjacency 
#   is indicated using 'HOR' or 'R' edge labels, superscripts
#   and subscripts by 'SUP' and 'SUB.'
#
# Author: R. Zanibbi, June 2012
# Copyright (c) 2012-2014 Richard Zanibbi and Harold Mouchere
################################################################

import os
import re
import sys
import csv
from bs4 import BeautifulSoup
from lg import *

def readtranslateFile(mapFile):
    """Read in symbol and structure mappings from a file."""
    try:
        fileReader = csv.reader(open(mapFile))
    except:
        sys.stderr.write('  !! IO Error (cannot open): ' + mapFile + '\n')
        return

    symbolsMap = {}
    relationsMap = {}
    readingSymbols = True
    for row in fileReader:
        # Skip blank lines and comments.
        if len(row) == 0:
            continue
        elif row[0].strip()[0] == '#':
            continue
        elif row[0].strip() == 'SYMBOLS':
            readingSymbols = True
        elif row[0].strip() == 'RELATIONSHIPS':
            readingSymbols = False
        else:
            if readingSymbols:
                symbolsMap[row[0]] = row[1]
            else:
                relationsMap[row[0]] = row[1]
    return symbolsMap, relationsMap

def readMapFile(fileName):
    """Read in symbol and structure mappings from a file."""
    try:
        fileReader = csv.reader(open(fileName))
    except:
        sys.stderr.write('  !! IO Error (cannot open): ' + fileName + '\n')
        return

    symbolMap = {}
    structureMap = {}
    readingSymbols = True
    for row in fileReader:
        # Skip blank lines and comments.
        if len(row) == 0:
            continue
        elif row[0].strip()[0] == '#':
            continue
        elif row[0].strip() == 'SYMBOLS':
            readingSymbols = True
        elif row[0].strip() == 'STRUCTURE':
            readingSymbols = False
        else:
            pattern = []
            replacement = []
            
            i = 0
            while not row[i].strip() == '->':
                pattern += [ row[i] ]
                i += 1
            i += 1
            while i < len(row):
                replacement += [ row[i] ]
                i += 1

            if len(pattern) > 1:
                relations = sorted( pattern[1:len(pattern)] )
                ptuple = tuple( [ pattern[0] ] + relations )
            else:
                ptuple = ( pattern[0] )

            if len(replacement) > 1:
                rtuple = tuple(replacement)
            else:
                rtuple = ( replacement[0] )

            if readingSymbols:
                symbolMap[ptuple] = rtuple
            else:
                structureMap[ptuple] = rtuple

    return (symbolMap, structureMap)


def translateStructure( lg, label, nodeRelationPairs, structureMap,\
            segPrimMap, edgeMap, symbolMap, segId, nodeString):
    """Generate a string for a given structure."""
    strString = ""
    byValue = lambda pair: pair[1]  
    sortedNodeRelationPairs = sorted(nodeRelationPairs, key=byValue)
    queryList = [ label ]

    primListString = ""
    for primitiveId in sorted(list(segPrimMap[ segId ][0])):
        primListString += primitiveId + ':'
            
    for (childId, relation) in sortedNodeRelationPairs:
        queryList += [ relation ]

    #print(primListString)
    #print(queryList)
    
    # Obtain the replacement, provided as an ordered sequence of
    # regions, giving the order in which to map subregions.
    key = tuple(queryList)
    anyKey = tuple(['ANY'] + queryList[1:])

    #print("key: " + str(key))
    #print(list(structureMap))
    if key in list(structureMap):
        replacementTuple = structureMap[ key ]
        #print("replacement: " + str(replacementTuple))

        # Find the node that matches each relation in the passed list,
        # and generate the appropriate string.
        for i in range(0,len(replacementTuple)):
            nextRelation = replacementTuple[ i ]

            match = False
            for j in range(0,len(nodeRelationPairs)):
                (childId, relation) = nodeRelationPairs[j]
                if relation == nextRelation:
                    strString += translate(lg, childId, segPrimMap,\
                                    edgeMap, symbolMap, structureMap) 
                    match = True
                    break
            # RZ, Jan 2013: allow other tags to be inserted (e.g. at end);
            # add primitive ids as identifier for symbols with multiple
            # subregions (e.g. fractions, roots)
            if not match:
                strString += replacementTuple[i].replace('_I_','\"' + \
                                primListString + '\"')

    # HACK!!! Copying and modifying above conditional branch.
    elif anyKey in list(structureMap):
        replacementTuple = structureMap[ anyKey ]
        #print("replacement: " + str(replacementTuple))

        # Find the node that matches each relation in the passed list,
        # and generate the appropriate string.
        for i in range(0,len(replacementTuple)):
            nextRelation = replacementTuple[ i ]

            match = False
            for j in range(0,len(nodeRelationPairs)):
                (childId, relation) = nodeRelationPairs[j]
                if relation == nextRelation:
                    strString += translate(lg, childId, segPrimMap,\
                                    edgeMap, symbolMap, structureMap) 
                    match = True
                    break
                elif nextRelation == 'PARENT':
                    strString += nodeString
                    match = True
                    break

            # RZ, Jan 2013: allow other tags to be inserted (e.g. at end);
            # add primitive ids as identifier for symbols with multiple
            # subregions (e.g. fractions, roots)
            if not match:
                    strString += replacementTuple[i].replace('_I_','\"' + \
                                    primListString + '\"')


    return strString

def translateRelation(lg, relation, nextChildId, structureMap,
                segPrimMap, edgeMap, symbolMap, nodeString):
    """Translate an individual spatial relation."""
    relString = ""
    replacementTuple = ()

    if relation in list(structureMap):
        replacementTuple = structureMap[ relation ]

    else:
        sys.stderr.write("  !! Error: Unknown relationship label " + relation + "\n")
        sys.stderr.write("  !!        Using relationship mapping: " + str(structureMap[ 'REL_DEFAULT' ]) + "\n")
        # Use default mapping if label is unknown.
        replacementList = list( structureMap[ 'REL_DEFAULT' ] )
        for i in range(0,len(replacementList)):
            replacementList[i] = replacementList[i].replace('_L_', \
                            relation)
        replacementTuple = tuple( replacementList )

    for i in range(0,len(replacementTuple)):
            nextEntry = replacementTuple[ i ]
            if nextEntry == "PARENT":
                # Add current symbol at this location
                relString += nodeString
            elif nextEntry == "CHILD":
                relString += translate(lg, nextChildId, segPrimMap, edgeMap, symbolMap, structureMap)
            else:
                relString += replacementTuple[i]

    return relString


def translate(lg, segId,  segPrimMap, edgeMap,  symbolMap, structureMap):
    """Recursively create output for an expression at the object level."""
    byValue = lambda pair: pair[1]  
    byRel = lambda pair: pair[0]

    oneSegPrimitive = list(segPrimMap[ segId ][0])[0]
    labelValuePairs = sorted(lg.nlabels[ oneSegPrimitive ].items(), key=byValue)
    (label, value) = labelValuePairs[0]

    nodeString = label  

    # Create label identifying primitives in the object.
    primListString = ""
    for primitiveId in sorted(list(segPrimMap[ segId ][0])):
        primListString += primitiveId + ':'

    if label in symbolMap:
        nodeString = symbolMap[ label ].replace('_I_','\"' + \
                            primListString + '\"')
    else:
            # Treat all unknowns uniformly.
        nodeString = symbolMap[ 'OBJ_DEFAULT' ].replace('_I_','\"' + \
                        primListString + '\"').replace('_L_',label)
        sys.stderr.write("  !! Error: Unknown object label " + label + "\n")

    if segId in edgeMap:
        # This node has children - lookup replacement based on sorted labels
        # for edges to child nodes.
        childSegIds = edgeMap[ segId ]
        nodeRelationPairs = []
        horRelation = []
        noSubSupPairs = []
        subSupPairs = []
        for nextChildId in childSegIds:
            # Obtain the highest-valued label for the edge.
            childPrimitive =list( segPrimMap[ nextChildId ][0])[0]
            edgeLabels = lg.elabels[ (oneSegPrimitive, childPrimitive) ]
            labelValuePairs = sorted(edgeLabels.items(), key=byValue)
            (relation, value) = labelValuePairs[0]

            # DEBUG: remove HOR/R relations, separate SUB/SUP relations.
            # Add missing "Sub" "Sup" labels for CROHME 2013.
            # DEBUG: Separate undefined labels into the 'noSubSupPairs' note
            #   that this binds these undefined relationships before any hor.
            #   adjacency relationship.
            if not (relation == 'HOR' or relation == 'R' or relation == "Right"):
                nodeRelationPairs += [ (nextChildId, relation) ]
                if not (relation == 'SUB' or relation == 'SUP' or \
                        relation == 'Sub' or relation == 'Sup' or
                        not relation in list(structureMap) and not relation == 'I' and not relation == 'Inside' ):
                    noSubSupPairs += [ (nextChildId, relation) ]
                else:
                    subSupPairs += [ (nextChildId, relation) ]
            
            else:
                horRelation += [ (nextChildId, relation) ]


        # CASE 1: all relations other than HOR/R are in a structure.
        strString = translateStructure(lg, label, nodeRelationPairs, structureMap,\
                        segPrimMap, edgeMap, symbolMap, segId, nodeString)
        if not strString == "":
            nodeString = strString 
        else:
            # CASE 2: only non-SUP/SUB relations are in a structure.
            strString = translateStructure(lg, label, noSubSupPairs, structureMap,\
                            segPrimMap, edgeMap, symbolMap, segId, nodeString)
            if not strString == "":
                nodeString = strString
                for (nextChildId, relation) in sorted(subSupPairs, key=byValue):
                    nodeString = translateRelation(lg, relation, nextChildId,
                                    structureMap, segPrimMap, edgeMap, symbolMap, nodeString)

                    #nodeString += translateRelation(lg, (relation, nextChildId),\
                    #		structureMap, segPrimMap, edgeMap, symbolMap)
            else:
                # DEFAULT: map relations independently.
                for (nextChildId, relation) in sorted(nodeRelationPairs, key=byValue):
                    nodeString = translateRelation(lg, relation, nextChildId,
                                    structureMap, segPrimMap, edgeMap, symbolMap, nodeString)

        # Lastly, generate string for adjacent symbols on the baseline.
        # **if there are multiple 'HOR' symbols all will be mapped.
        for (child, relation) in horRelation:
            nodeString = translateRelation(lg, relation, child,
                            structureMap, segPrimMap, edgeMap, symbolMap, nodeString)

    return nodeString

def cleanRows( mmlFile ):
    # with open( filePath ) as mmlFile:
    rowSoup = BeautifulSoup(mmlFile, 'html.parser')

    mrows = rowSoup.find_all('mrow')

    for item in mrows:
        if item.parent.name == "mrow":
                # REMOVING to avoid unexpected cases.
                #or len( item.contents ) < 2:
            item.unwrap()

    return rowSoup.prettify()

def lg2mml(lg_file, mapFile):

    lg = Lg(lg_file)

    # Hide the unlabeled edges.
    lg.hideUnlabeledEdges()

    (segmentPrimitiveMap, primitiveSegmentMap, noparentSegments, segmentEdges) = \
                            lg.segmentGraph()
    (rootNodes, treeEdges, otherEdges) = lg.separateTreeEdges()

    # Default symbol and structure mappings.
    symbolMap = { }
    structureMap = { }

    if mapFile:
        (symbolMap, structureMap) = readMapFile(mapFile)

    # Create a map from nodes to child nodes, in order to be able to
    # detect structures such as fractions, etc.
    treeEdgeMap = {}
    for (parent, child) in treeEdges:
        if parent in treeEdgeMap:
            treeEdgeMap[ parent ] += [ child ]
        else:
            treeEdgeMap[ parent ] = [ child ]

        # NOTE: currently this will print out more than one expression on 
        # separate lines if a graph has multiple root nodes.
        
    # Exit if there is no root node, generate a list of TeX expressions if there are
    # multiple root nodes.
    if len(rootNodes) < 1:
        sys.stderr.write("  !! Error: graph contains no root node; cannot generate output.\n")
        sys.exit(1)
    elif len(rootNodes) > 1:
        sys.stderr.write("  !! Graph contains " + str(len(rootNodes)) + " root nodes.\n")

    mml_out_raw = []
    for root in rootNodes:
        # print(translate(lg, root, segmentPrimitiveMap, treeEdgeMap,\
        # 		symbolMap, structureMap))
        mml_out_raw.append(translate(lg, root, segmentPrimitiveMap, 
                                treeEdgeMap, symbolMap, structureMap))
    # mml_out = postprocess("\n".join(mml_out_raw)) 
    mml_out = cleanRows("\n".join(mml_out_raw)) 
    return mml_out

def preprocess(filename, translateFile, translate=True):
    lg = Lg(filename)
    print(lg)
    lg_NE_string = lg.csv()
    if translate:
        symbolsMap, relationsMap = readtranslateFile(translateFile)
        edge_pattern = re.compile(r'^E,')
        node_pattern = re.compile(r'(^N,)|#')
        symbol_temp = relabel(lg_NE_string, symbolsMap, node_pattern)
        edge_temp = relabel(lg_NE_string, dict(**symbolsMap, **relationsMap), 
                            edge_pattern)
        lg_NE_string = symbol_temp + "\n" + edge_temp
    temp_file = "temp.lg"
    with open(temp_file, 'w') as f:
        f.writelines(lg_NE_string)
    return temp_file
        
def relabel(lg_NE_string, Map, pattern):
    temp = '\n'.join([line for line in lg_NE_string.split('\n') 
                       if re.match(pattern, line)])
    for source_label, mapped_label in Map.items():
        temp = temp.replace("," + source_label + ",", "," + mapped_label + ",")
    return temp

def main(lg_file, mapFile, translateFile):
    temp_lg_file = preprocess(lg_file, translateFile)
    mml_out = lg2mml(temp_lg_file, mapFile)
    os.remove(temp_lg_file)
    return mml_out

if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("Usage: [[python]] lg2txt.py <infile.lg> [mapfile.csv] [translatefile.csv]")
        print("")
        print("   Produces a text file for label graph file")
        print("   <infile.lg>. A symbol and structure map file (mapfile.csv)")
        print("   may be provided to override default (latex) mappings.")
        sys.exit()
    lg_file = sys.argv[1]
    if len(sys.argv) > 3:
        mapFile = sys.argv[2]
        translateFile = sys.argv[3]
    elif len(sys.argv) > 2:
        mapFile = sys.argv[2]
        translateFile = None
    else:
        mapFile = None
    mml_out = main(lg_file, mapFile, translateFile)
    print(mml_out)