evaluate

#!/bin/bash

# Make sure that CROHMELibDir and LgEvalDir are defined in
# your shell environment, e.g. by including:
#	
#	export LgEvalDir=<path_to_LgEval>
#	export CROHMELibDir=<path_to_CROHMELib>       		
#	export PATH=$PATH:$CROHMELibDir/bin:$LgEvalDir/bin
# 
# in your .bashrc file (the initialization file for bash shell). The PATH
# alteration will add the tools to your search path. 

# Called without arguments: print the usage message and stop.
# The exit status stays 0 because running the tool bare is how help is requested.
if [[ $# -lt 1 ]]; then
	printf '%s\n' \
		"LgEval evaluate: Label graph evaluation tool" \
		"Copyright (c) R. Zanibbi, H. Mouchere, M. Mahdavi, A.K. Shah 2012-2022" \
		"" \
		"Usage: evaluate outputDir groundTruthDir [p/t/d/s/b] [png/pdf/both] OR" \
		"       evaluate fileList [p/t/d/s/b] [png/pdf/both]" \
		"" \
		"Evaluates label graph (.lg) files in outputDir against the same files" \
		"in groundTruthDir. groundTruthDir defines the list of files to be compared" \
		"(i.e. if a file is not in the ground truth directory, it is ignored)." \
		"" \
		"If a list of file pairs is provided ('output target' provided on each line)" \
		"then these file pairs are used for evaluation." \
		"" \
		"The final optional arguments define the graph type to use in visualizing" \
		"errors, and their output format. Run 'lg2dot' for more on graph types."
	exit 0
fi

# Defaults and bookkeeping state shared by the rest of the script.
DOTARG=""                   # graph type passed to lg2dot; empty = no error graphs
BNAME=$(basename -- "$1")   # quoted so paths containing spaces survive
MODE="Dir"                  # "Dir" (two directories) or "List" (file of pairs)
FORMAT="pdf"                # error graph format: pdf, png or both
TARGETS=""                  # ground truth .lg files, one per line
TARGET_COUNT=0
OUTPUTS=""                  # output .lg files, one per line
NL=$'\n'

OUTCOME_LIST=""             # "<file>, Correct|Incorrect" lines gathered while evaluating
ResultsDir="Results_$BNAME"
MULTI_PASS_WARN=0           # set to 1 once the "already processed" notice was shown

################################################################
# Compile the list of output files and ground truth files.
#
# NOTE: Ground truth files define the evaluation set, extra 
# output files are ignored.
################################################################

echo ""
echo "[ LgEval evaluate ]"
echo ""

if [[ ! -d "$1" ]]; then
	# Case 1: $1 is not a directory, so it must be a list of file pairs
	# ("output target" on each line).
	MODE="List"

	if [[ ! -r "$1" ]]; then
		echo "ERROR: '$1' is neither a directory nor a readable file list." >&2
		exit 1
	fi

	LABEL_STRING="List File: $1"
	echo "$LABEL_STRING"

	# First column: output files; second column: ground truth (target) files.
	# The list is fed on stdin so that an unusual file name cannot be
	# misread by awk as an option or a variable assignment.
	OUTPUTS=$(awk '{ print $1; }' < "$1")
	# Word splitting is intentional: the evaluation loop indexes this array
	# in step with the (equally word-split) TARGETS list.
	# shellcheck disable=SC2206
	OUTARR=($OUTPUTS)
	TARGETS=$(awk '{ print $2; }' < "$1")

	# Optional flags: graph type, then output format.
	if [[ $# -gt 1 ]]; then
		DOTARG=$2
	fi
	if [[ $# -gt 2 ]]; then
		FORMAT=$3
	fi
else
	# Case 2: a pair of directories (outputs, ground truth).
	if [[ $# -lt 2 || ! -d "$2" ]]; then
		# Without this check an empty $2 would make the script look for
		# ground truth files in the filesystem root.
		echo "ERROR: ground truth directory '${2:-}' is missing or not a directory." >&2
		echo "       Run 'evaluate' without arguments for usage." >&2
		exit 1
	fi

	OUT_STRING="Output File Directory:  $1"
	GT_STRING="Ground Truth Directory: $2"
	# Two-line label: printf keeps the newline between both strings.
	LABEL_STRING=$(printf '%s\n%s' "$OUT_STRING" "$GT_STRING")
	echo "$LABEL_STRING"

	# Let the shell expand the .lg files instead of parsing 'ls' output.
	# An unmatched pattern stays literal, hence the -e checks below.
	out_files=("$1"/*.lg)
	gt_files=("$2"/*.lg)

	if [[ -e "${out_files[0]}" ]]; then
		OUTPUTS=$(printf '%s\n' "${out_files[@]}")
	else
		echo "WARNING: no .lg files found in output directory '$1'." >&2
		OUTPUTS=""
	fi
	if [[ -e "${gt_files[0]}" ]]; then
		TARGETS=$(printf '%s\n' "${gt_files[@]}")
	else
		echo "WARNING: no .lg files found in ground truth directory '$2'." >&2
		TARGETS=""
	fi

	# Optional flags: graph type, then output format.
	if [[ $# -gt 2 ]]; then
		DOTARG=$3
	fi
	if [[ $# -gt 3 ]]; then
		FORMAT=$4
	fi
fi
echo "* LgEval Results Directory: $ResultsDir"

# Count whitespace-separated entries, matching how the evaluation loop
# iterates over TARGETS. The arithmetic expansion strips any padding that
# wc may add around the number.
TARGET_COUNT=$(( $(printf '%s\n' "$TARGETS" | wc -w) ))


################################################################
# Create output directory structure, compile class labels
################################################################
#######################################
# Create the results directory tree. Safe to call on an existing tree:
# missing sub-directories are added, so a later pass that newly requests
# error visualizations still gets its errorGraphs/ directories.
# Arguments:
#   $1 - results directory
#   $2 - graph type for error visualizations (empty = none requested)
#   $3 - visualization format: pdf, png or both (other values create no
#        format directory)
# Returns: 0 on success, non-zero if a directory could not be created
#######################################
make_results_dirs() {
	local results_dir=$1 dot_arg=$2 format=$3

	if [[ -z "$results_dir" ]]; then
		echo "ERROR: results directory name is empty." >&2
		return 1
	fi

	mkdir -p -- "$results_dir/Metrics" || return

	# Directories for dot error visualizations are only needed on request.
	if [[ -z "$dot_arg" ]]; then
		return 0
	fi

	mkdir -p -- "$results_dir/errorGraphs/dot" || return
	case "$format" in
		pdf | png)
			mkdir -p -- "$results_dir/errorGraphs/$format"
			;;
		both)
			mkdir -p -- "$results_dir/errorGraphs/pdf" "$results_dir/errorGraphs/png"
			;;
	esac
}

make_results_dirs "$ResultsDir" "$DOTARG" "$FORMAT"

# Compile labels from ground truth and from the output files. The ground
# truth labels are needed for confusion matrices to be properly defined,
# and for sanity checking results.
if [[ -z "${LgEvalDir:-}" ]]; then
	echo "ERROR: LgEvalDir is not set; see the notes at the top of this script." >&2
	exit 1
fi

# compileLabels.py is handed a file that lists the .lg files, one per line;
# the same scratch file is reused for both lists and removed afterwards.
label_list="$ResultsDir/temp_file_list"

printf '%s\n' "$TARGETS" > "$label_list"
python3 "$LgEvalDir/src/compileLabels.py" "$label_list" > "$ResultsDir/labelsGT.txt"

printf '%s\n' "$OUTPUTS" > "$label_list"
python3 "$LgEvalDir/src/compileLabels.py" "$label_list" > "$ResultsDir/labelsOutput.txt"

rm -f -- "$label_list"


################################################################
# Evaluate files
################################################################

# Compute all .csv metrics outputs (per-file), and .diff results (per-file).

#######################################
# Append one "<output file>, <outcome>" line to OUTCOME_LIST.
# The first entry is detected by the list being empty, not by the loop
# index: when the leading files were already processed in an earlier pass,
# an index-based check would leave a blank first line, which then shows up
# as an empty row in Correct.csv.
# Globals:   OUTCOME_LIST (appended to)
# Arguments: $1 - output file path
#            $2 - outcome label ("Correct" or "Incorrect")
#######################################
record_outcome() {
	if [[ -z "$OUTCOME_LIST" ]]; then
		OUTCOME_LIST="$1, $2"
	else
		OUTCOME_LIST+=$'\n'"$1, $2"
	fi
}

echo ""
echo "Evaluating..."

# Iterate over ground truth files (word splitting of TARGETS is intended).
INDEX=0
for file in $TARGETS; do
	FNAME=$(basename -- "$file" .lg)
	nextFile="_ERROR_"
	if [[ "$MODE" == "Dir" ]]; then
		# Same file name inside the output directory; collapse any "//"
		# produced by a trailing slash on the directory argument.
		nextFile="$1/$FNAME.lg"
		nextFile=${nextFile//\/\///}
	else
		# List mode: outputs and targets are paired by position.
		nextFile=${OUTARR[INDEX]}
	fi

	if [[ ! -e "$ResultsDir/Metrics/$FNAME.csv" ]]; then
		# NOTE: the script convertCrohmeLg can be used to convert
		#       crohme .inkml files to .lg files.
		CORRECT="Correct"

		# One evallg.py run yields both the metrics and the difference lines.
		OUT=$(python3 "$LgEvalDir/src/evallg.py" "$nextFile" "$file" INTER)

		# Lines containing an asterisk are differences/errors; everything
		# else is metric data. The variables stay quoted so the newlines
		# in the captured output are preserved.
		DIFF=$(printf '%s\n' "$OUT" | grep "\*")
		printf '%s\n' "$DIFF" > "$ResultsDir/Metrics/$FNAME.diff"
		printf '%s\n' "$OUT" | grep -v "\*" > "$ResultsDir/Metrics/$FNAME.csv"

		# If differences were reported, mark the file and optionally
		# generate visualizations of the differences between the graphs.
		if [[ -n "$DIFF" ]]; then
			CORRECT="Incorrect"

			if [[ -n "$DOTARG" ]]; then
				# "d" selects lg2dot's default graph type, so no
				# --graph_type option is passed for it.
				if [[ "$DOTARG" == "d" ]]; then
					lg2dot "$nextFile" "$file" --format "$FORMAT"
				else
					lg2dot "$nextFile" "$file" --graph_type "$DOTARG" --format "$FORMAT"
				fi

				# lg2dot output is picked up from the current directory.
				mv -- "$FNAME.dot" "$ResultsDir/errorGraphs/dot"
				case "$FORMAT" in
					pdf)
						mv -- "$FNAME.pdf" "$ResultsDir/errorGraphs/pdf"
						;;
					png)
						mv -- "$FNAME.png" "$ResultsDir/errorGraphs/png"
						;;
					both)
						mv -- "$FNAME.pdf" "$ResultsDir/errorGraphs/pdf"
						mv -- "$FNAME.png" "$ResultsDir/errorGraphs/png"
						;;
				esac
			fi
		fi

		# Record whether file was correct or not.
		record_outcome "$nextFile" "$CORRECT"
	elif (( MULTI_PASS_WARN == 0 )); then
		# Metrics already exist from an earlier pass; say so only once.
		echo "  * Already processed: $file"
		echo "    (message suppressed for other files)"
		MULTI_PASS_WARN=1
	fi

	# Progress is reported after every file.
	INDEX=$((INDEX + 1))
	PERCENT=$(echo "scale=1; 100 * $INDEX / $TARGET_COUNT" | bc)
	printf '  %s%% complete (%s of %s)\r' "$PERCENT" "$INDEX" "$TARGET_COUNT"
done
printf '  %s%% complete (%s of %s)\r' "$PERCENT" "$INDEX" "$TARGET_COUNT"


################################################################
# Compile metrics 
# Including summaries and confusion matrices
#
# Stored as individual files to prevent re-computation for user
################################################################

#######################################
# Concatenate the per-file metrics (.csv) and differences (.diff) found in
# <results dir>/Metrics into <results dir>/<base>.csv and <base>.diff, and
# keep the 00_NoErrors marker in step with the merged differences.
#
# A .diff file is written for every evaluated file (holding only a blank
# line when nothing differed), so "no errors" is decided by whether the
# merged file has any non-blank content, not by whether .diff files exist.
# Arguments: $1 - results directory
#            $2 - base name of the merged files
# Returns:   non-zero if the Metrics directory is missing
#######################################
merge_metric_files() {
	local results_dir=$1 base=$2
	local -a metric_files diff_files

	if [[ ! -d "$results_dir/Metrics" ]]; then
		echo "ERROR: missing metrics directory: $results_dir/Metrics" >&2
		return 1
	fi

	# An unmatched glob stays literal, hence the -e checks; the merged
	# files are created (empty) even when there is nothing to merge.
	metric_files=("$results_dir"/Metrics/*.csv)
	if [[ -e "${metric_files[0]}" ]]; then
		cat -- "${metric_files[@]}" > "$results_dir/$base.csv"
	else
		: > "$results_dir/$base.csv"
	fi

	diff_files=("$results_dir"/Metrics/*.diff)
	if [[ -e "${diff_files[0]}" ]]; then
		cat -- "${diff_files[@]}" > "$results_dir/$base.diff"
	else
		: > "$results_dir/$base.diff"
	fi

	if grep -q '[^[:space:]]' "$results_dir/$base.diff"; then
		# Errors present: drop a marker left behind by an earlier pass.
		rm -f -- "$results_dir/00_NoErrors"
	else
		touch "$results_dir/00_NoErrors"
	fi
}

if [[ -n "$OUTCOME_LIST" ]]; then
	# Append only when there are new outcomes, to avoid adding empty
	# entries to Correct.csv; then keep the file sorted by filename.
	printf '%s\n' "$OUTCOME_LIST" >> "$ResultsDir/Correct.csv"
	sort -o "$ResultsDir/Correct.csv" "$ResultsDir/Correct.csv"
fi

merge_metric_files "$ResultsDir" "$BNAME"

# Compute summaries from the merged metrics and differences:
#   Summary.txt             - sumMetric.py output for the merged metrics
#   ConfusionMatrices.html  - sumDiff.py output with the extra "html" argument
#   ConfusionMatrices.csv   - sumDiff.py output without it
# All paths are quoted because BNAME is derived from a user-supplied path.
merged_csv="$ResultsDir/$BNAME.csv"
merged_diff="$ResultsDir/$BNAME.diff"
gt_labels="$ResultsDir/labelsGT.txt"

python3 "$LgEvalDir/src/sumMetric.py" "$LABEL_STRING" "$merged_csv" \
	> "$ResultsDir/Summary.txt"
python3 "$LgEvalDir/src/sumDiff.py" "$merged_diff" "$gt_labels" html \
	> "$ResultsDir/ConfusionMatrices.html"
python3 "$LgEvalDir/src/sumDiff.py" "$merged_diff" "$gt_labels" \
	> "$ResultsDir/ConfusionMatrices.csv"


################################################################
# Create FileMetrics.csv and summary spreadsheet
################################################################
#######################################
# Print every second comma-separated field of each line of a file, joined
# by commas. Lines with fewer fields than the starting index print nothing.
# The merged metrics alternate label,value,label,value,... so starting at 1
# selects the labels (headers) and starting at 2 selects the values (data).
# Arguments: $1 - index of the first field to keep (1 or 2)
#            $2 - input file
# Outputs:   selected fields on stdout
#######################################
every_other_field() {
	awk -F',' -v first="$1" '{
		for (i = first; i <= NF; i += 2)
			printf "%s%s", $i, (i + 2 <= NF ? "," : "\n")
	}' < "$2"
}

if [[ -f "$ResultsDir/$BNAME.csv" ]]; then
	every_other_field 1 "$ResultsDir/$BNAME.csv" > "$ResultsDir/Headers.csv"
	every_other_field 2 "$ResultsDir/$BNAME.csv" > "$ResultsDir/Data.csv"

	# One header row is enough; prefix it with the labels of the two
	# columns that come from Correct.csv.
	HEAD=$(head -n 1 "$ResultsDir/Headers.csv")
	echo "File,Result,$HEAD" > "$ResultsDir/HeaderRow.csv"

	# Combine file names/outcomes with the raw data, line by line, then
	# put the header row on top.
	paste -d , "$ResultsDir/Correct.csv" "$ResultsDir/Data.csv" > "$ResultsDir/DataNew.csv"
	cat "$ResultsDir/HeaderRow.csv" "$ResultsDir/DataNew.csv" > "$ResultsDir/FileMetrics.csv"
else
	echo "WARNING: $ResultsDir/$BNAME.csv not found; FileMetrics.csv not created." >&2
fi


##################################
# Clean up 
##################################
# Remove the intermediate files used to build FileMetrics.csv, and the
# merged metrics/differences files. Paths are quoted because BNAME is
# derived from a user-supplied path.
rm -f "$ResultsDir/Headers.csv" "$ResultsDir/HeaderRow.csv" "$ResultsDir/Data.csv"
rm -f "$ResultsDir/DataNew.csv"
# RZ: not deleting Correct.csv, to ensure that all files are present.
#rm -f "$ResultsDir/Correct.csv"
rm -f "$ResultsDir/$BNAME.csv" "$ResultsDir/$BNAME.diff"


##################################
# Remind user of outputs
##################################
#######################################
# Print the closing overview of what the results directory contains.
# The names listed match what this script writes: errorGraphs/ and
# labelsOutput.txt (previously shown as graphErrors/ and labelsOut.txt).
# Globals: ResultsDir (read)
# Outputs: overview text on stdout
#######################################
print_results_overview() {
	printf '%s\n' \
		"" \
		"done." \
		"" \
		"$ResultsDir/ contents:" \
		"   Summary.txt     --   Readable metrics summary" \
		"   Correct.csv     --   Records which files are correct/incorrect" \
		"   errorGraphs/    --   Error graph visualizations (if requested)" \
		"   labelsOutput.txt --  Node & edge labels in output files" \
		"   labelsGT.txt    --   Node & edge labels in ground truth files" \
		"   FileMetrics.csv --   Raw metrics file" \
		"   ConfusionMatrices.html  -- Readable web page with confusion matrices (HTML)" \
		"   ConfusionMatrices.csv   -- Confusion matrix (CSV format)" \
		"   Metrics/                -- Individual file metrics (.csv) & differences (.diff)" \
		""
}

print_results_overview