# sumDiff.py

################################################################
# sumDiff.py
#
# Program that reads in a .diff (CSV) file containing differences between
# two BG, and outputs confusion matrices for symbols and spatial relations.
# Output is in CSV or HTML formats.
# 
# Author: H. Mouchere, June 2012
# Copyright (c) 2012-2014, Richard Zanibbi and Harold Mouchere
################################################################
import sys
import csv
import collections
import time
import os

def addOneError(confM, id1, id2):
    """Count one more confusion for the pair (id1, id2).

    confM is a two-level mapping indexed as confM[id1][id2]. The callers in
    this file pass nested defaultdicts, so a row or cell that does not exist
    yet is created on first access and no explicit initialisation is needed.
    """
    row = confM[id1]
    row[id2] += 1

def affMat(output, allID, confM):
    """Write a confusion matrix to `output` as CSV text.

    Parameters:
      output -- any object with a write(str) method (e.g. sys.stdout).
      allID  -- iterable of labels; they are sorted and used for both the
                header row and the row order.
      confM  -- two-level mapping indexed as confM[rowLabel][columnLabel],
                holding counts. Every (row, column) pair from allID is
                looked up, so confM must tolerate missing keys (the callers
                in this file pass nested defaultdicts, for which a lookup
                also creates the missing entry).

    Layout: the first line is "Output:" followed by the quoted labels; each
    following line is a quoted row label followed by one cell per column.
    Zero counts are written as empty cells.
    """
    # Sort once; the same order is used for the header and for every row.
    labels = sorted(allID)

    # Header
    output.write("Output:")
    for k in labels:
        output.write(",'" + str(k) + "'")
    output.write("\n")

    # Data
    for k1 in labels:
        output.write("'" + str(k1) + "'")
        for k2 in labels:
            count = confM[k1][k2]
            if count == 0:
                output.write(",")
            else:
                output.write("," + str(count))
        output.write("\n")

def affMatHTML(output, allID, confM):
    """Write a confusion matrix to `output` as an HTML <table>.

    Parameters:
      output -- any object with a write(str) method (e.g. sys.stdout).
      allID  -- iterable of labels; they are sorted and used for the header
                row, the footer row and the row order.
      confM  -- two-level mapping indexed as confM[rowLabel][columnLabel],
                holding counts. Every (row, column) pair from allID is
                looked up, so confM must tolerate missing keys (the callers
                in this file pass nested defaultdicts, for which a lookup
                also creates the missing entry).

    Layout: a header row of column labels, one row per label with the row
    label repeated in the first and last cell, and a footer row repeating
    the column labels. Data cells carry class "col_<index>" and are left
    empty when the count prints as "0".

    NOTE: labels are written verbatim (no HTML escaping), so a label that
    contains markup characters ends up as markup in the output.
    """
    # Sort once; the same order is used for header, rows and footer.
    labels = sorted(allID)

    output.write("<table>\n<tr><th><i>(Out:Rows)</i></th>")
    for k in labels:
        output.write("<th>" + str(k) + "</th>")
    output.write("</tr>\n")

    for k1 in labels:
        output.write("<tr><th>" + str(k1) + "</th>")
        for i, k2 in enumerate(labels):
            val = str(confM[k1][k2])
            if val == "0":
                val = ""
            output.write('<td class="col_' + str(i) + '">' + val + "</td>")
        # Repeat the row label on the right-hand side of the row.
        output.write("<th>" + str(k1) + "</th></tr>\n")

    # Footer: repeat the column labels below the data.
    output.write("<tr><th></th>")
    for k in labels:
        output.write("<th>" + str(k) + "</th>")
    output.write("</tr>\n")
    output.write("</table>\n")

def writeCSS(output, allID):
    """Write the report's <head> element, holding its inline stylesheet.

    output -- any object with a write(str) method.
    allID  -- not used by the active code; the only code that referred to
              it (per-column "td.col_<i>:hover" rules) is disabled.

    Each rule is sent to `output` with its own write() call.
    """
    cssChunks = (
        '<head><style type="text/css">\n',
        'table { border-collapse:collapse;}\n',
        'p { line-height: 125%;}\n',
        'ul { line-height: 125%;}\n',
        'th{ text-align: right; padding: 4px;}\n',
        'td { text-align: right; border: 1px solid lightgray; padding: 4px; }\n',
        # Highlight the whole row under the pointer ...
        'tr:hover{background-color:rgb(180,200,235);}\n ',
        # ... and the single cell under the pointer.
        'td:hover{background-color:yellow;} \n',
        '</style></head>\n',
    )
    for chunk in cssChunks:
        output.write(chunk)

def main():
    """Summarise a .diff file as three confusion matrices on stdout.

    Command line: sumDiff.py <file1.diff> <labelsGT.txt> [HTML]

    Inputs (both read with the csv module):
      * file1.diff   -- rows are dispatched on their first field: "DIFF"
        rows (file names) and "*S" rows (segmentation errors) are skipped,
        "*N" rows contribute a node-label confusion between fields 2 and 5,
        and "*E" rows contribute an edge-label confusion between fields 3
        and 6. The first field of each pair is used as the matrix row,
        which the report text calls the output label.
      * labelsGT.txt -- one label per row; entries before an
        "EDGE LABELS:" row are collected as node labels, entries after it
        as edge labels.

    Output: three matrices (node labels, "short" edge labels where every
    label that is not relationship-only is collapsed to '*', and all edge
    labels). CSV by default; HTML when any third argument is present.

    Exits with status 0 after printing usage when arguments are missing,
    and with status 1 when either input file cannot be opened.

    NOTE: diagnostics about suspicious rows are printed to stdout and are
    therefore interleaved with the report.
    """
    if len(sys.argv) < 3:
        print("Usage : [[python]] sumDiff.py <file1.diff> <labelsGT.txt> [HTML]\n")
        print("\tMerge results for each line in file1.diff into confusion Matrices.")
        print("\tBy default output is sent to stdout in CSV format.")
        print(" requires list of GT labels from labelsGT.txt.")
        print("\t[HTML] option changes output format to HTML.")
        sys.exit(0)

    fileName = sys.argv[1]
    labelFile = sys.argv[2]
    # Any third argument, whatever its value, selects HTML output.
    withHTML = len(sys.argv) > 3

    # Open both inputs up front so that a missing file is reported before
    # any parsing starts.
    try:
        diffHandle = open(fileName)
    except IOError:
        sys.stderr.write('  !! IO Error (cannot open): ' + fileName)
        sys.exit(1)

    try:
        labelHandle = open(labelFile)
    except IOError:
        diffHandle.close()
        # Report the file that actually failed (the label file).
        sys.stderr.write('  !! IO Error (cannot open): ' + labelFile)
        sys.exit(1)

    # Confusion matrices: dict -> dict -> int. The factory is the bound
    # copy() of an empty defaultdict(int), so each new row is a fresh
    # defaultdict(int).
    labelM = collections.defaultdict(collections.defaultdict(int).copy)
    spatRelM = collections.defaultdict(collections.defaultdict(int).copy)

    allLabel = set()
    allSR = set()

    # All confused symbol (node) labels. Used below to present relationship
    # and segmentation confusions separately.
    symbolLabels = set()

    nodeErrors = 0
    allSegErrors = 0
    allRelErrors = 0
    fposMerge = 0
    fnegMerge = 0

    # Ground-truth label sets. Only the node labels are consulted below;
    # the edge labels are collected but not used.
    readEdges = False
    gtNodeLabels = set()
    gtEdgeLabels = set()

    with diffHandle:
        with labelHandle:
            for row in csv.reader(labelHandle):
                if len(row) == 0:
                    continue

                nextEntry = row[0].strip()
                if nextEntry == 'NODE LABELS:':
                    continue
                elif nextEntry == 'EDGE LABELS:':
                    # Everything after this marker is an edge label.
                    readEdges = True
                elif readEdges:
                    gtEdgeLabels.add(nextEntry)
                else:
                    gtNodeLabels.add(nextEntry)

        # rowCount is the 0-based index of the row within the .diff file.
        for rowCount, row in enumerate(csv.reader(diffHandle)):
            # Skip blank lines.
            if len(row) == 0:
                continue

            entryType = row[0].strip()

            # Skip file names.
            if entryType == "DIFF":
                continue

            # Process node label errors.
            elif entryType == "*N":
                # Fields 2 and 5 are read below; report and skip rows that
                # are too short instead of failing with an IndexError.
                if len(row) < 6:
                    print("INVALID LENGTH at row: " + str(rowCount) + " for file: " + fileName)
                    print(row)
                    continue

                outputLabel = row[2].strip()
                otherLabel = row[5].strip()

                symbolLabels.add(outputLabel)
                symbolLabels.add(otherLabel)

                addOneError(labelM, outputLabel, otherLabel)
                allLabel.add(outputLabel)
                allLabel.add(otherLabel)

                nodeErrors += 1

            # Process link (edge) errors.
            elif entryType == "*E":
                # Fields 3 and 6 are read below; report and skip rows that
                # are too short instead of failing with an IndexError.
                if len(row) < 7:
                    print("INVALID LENGTH at row: " + str(rowCount) + " for file: " + fileName)
                    print(row)
                    continue

                outputLabel = row[3].strip()
                otherLabel = row[6].strip()

                # DEBUG: flag rows whose label field holds "1.0", and rows
                # that do not have exactly 8 fields. The row is still
                # counted either way.
                if outputLabel == "1.0" or otherLabel == "1.0":
                    print("ERROR at row: " + str(rowCount) + " for file: " + fileName)
                    print(row)
                elif len(row) != 8:
                    print("INVALID LENGTH at row: " + str(rowCount) + " for file: " + fileName)
                    print(row)

                addOneError(spatRelM, outputLabel, otherLabel)

                allSR.add(outputLabel)
                allSR.add(otherLabel)

            elif entryType == "*S":
                # Currently ignore segmentation errors (i.e. object-level errors).
                continue

    # Obtain the list of edge labels that do not appear on nodes.
    # DEBUG: need to consult all GT labels in general case (handling '*' input).
    mergeEdgeLabel = '*'
    relOnlyLabels = allSR - symbolLabels - gtNodeLabels
    # Wrap the label in a list: union() iterates its argument, so passing
    # the bare string would add its individual characters instead.
    relMergeLabels = relOnlyLabels.union([mergeEdgeLabel])

    # Create a modified confusion histogram where all symbol/segmentation
    # edge confusions are treated as being of the same type.
    ShortEdgeMatrix = collections.defaultdict(collections.defaultdict(int).copy)
    for output in spatRelM:
        olabel = output if output in relOnlyLabels else mergeEdgeLabel

        for target in spatRelM[output]:
            tlabel = target if target in relOnlyLabels else mergeEdgeLabel
            count = spatRelM[output][target]

            # Increment the entry for the appropriate matrix.
            ShortEdgeMatrix[olabel][tlabel] += count

            outputCollapsed = olabel != output
            targetCollapsed = tlabel != target
            if outputCollapsed or targetCollapsed:
                # At least one side was collapsed to '*'.
                allSegErrors += count
                if outputCollapsed and not targetCollapsed:
                    # '*' in the output vs. a relationship label.
                    fposMerge += count
                elif targetCollapsed and not outputCollapsed:
                    # Relationship label in the output vs. '*'.
                    fnegMerge += count
            else:
                allRelErrors += count

    if withHTML:
        sys.stdout.write('<html>')
        writeCSS(sys.stdout, allLabel.union(allSR))
        print('<font face="helvetica,arial,sans-serif">')
        print("<h2>LgEval Error Summary</h2>")
        print(time.strftime("%c"))
        print("<br>\n")
        # File name without its directory and extension.
        print("<b>File:</b> " + os.path.splitext(os.path.split(fileName)[1])[0] + "<br>")
        print("<p>All confusion matrices show only errors. In each matrix, output labels appear in the left column, and target labels in the top row.</p>")
        print('<UL><LI><A href="#nodes">Node Label Confusion Matrix</A> <LI> <A HREF="#ShortEdges">Edge Label Confusion Matrix (short - ignoring object class confusions)</A> <LI> <A HREF="#Edges">Edge Label Matrix (all labels)</A> </UL>')
        print("<hr>")
        print('<h2><A NAME="nodes">Node Label Confusion Matrix</A></h2>')
        print("<p>" + str(len(allLabel)) + " unique node labels. " + str(nodeErrors) + " errors. ABSENT: a node missing in the output or target graph</p>")
        affMatHTML(sys.stdout, allLabel, labelM)

        print("<br><hr><br>")
        print('<h2><A NAME="ShortEdges">Edge Label Confusion Matrix (Short)</A></h2>')
        print(
            "<p>" + str(len(relOnlyLabels))
            + " unique relationship labels + * representing grouping two nodes into an object (any type). "
            + str(allSegErrors + allRelErrors) + " errors <UL><LI>"
            + str(allSegErrors)
            + " Directed segmentation and node pair classification errors (entries in '*'-labeled row and column) <UL><LI><b>"
            + str(allSegErrors - fposMerge - fnegMerge)
            + " edges between correctly grouped nodes, but with conflicting classification (* vs. *)</b> <LI>"
            + str(fposMerge) + " false positive merge edges (* vs. other)<LI>"
            + str(fnegMerge) + " false negative merge edges (other vs. *) </UL>  <LI>"
            + str(allRelErrors)
            + " Directed relationship errors (remaining matrix entries) </UL></p>"
        )
        affMatHTML(sys.stdout, relMergeLabels, ShortEdgeMatrix)

        print("<br><hr><br>")
        print('<h2><A NAME="Edges">Edge Label Confusion Matrix (All Errors)</A></h2>')
        print("<p>" + str(len(allSR)) + " unique edge labels representing relationships and node groupings for specific symbol types. " + str(allSegErrors + allRelErrors) + " errors</p>")
        affMatHTML(sys.stdout, allSR, spatRelM)

        print("</font>")
        sys.stdout.write('</html>')
    else:
        print("LgEval Error Summary for: " + fileName)
        print(time.strftime("%c"))
        print("")
        print("NOTE: This file contains 3 confusion matrices.")
        print("")
        print("I. Node Label Confusion Matrix: " + str(len(allLabel)) + " unique labels. ABSENT: a node missing in the output or target graph")
        affMat(sys.stdout, allLabel, labelM)

        print("")
        print("")
        print("II. Edge Label Confusion Matrix (Short): " + str(len(relOnlyLabels)) + " unique relationship labels + * (merge)")
        affMat(sys.stdout, relMergeLabels, ShortEdgeMatrix)

        print("")
        print("")
        print("III. Edge Label Confusion Matrix (Full): " + str(len(allSR)) + " unique labels for relationships and node groupings for specific symbol types")
        affMat(sys.stdout, allSR, spatRelM)

# Run the report only when this file is executed as a script, so that
# importing it does not parse sys.argv or call sys.exit().
if __name__ == "__main__":
    main()