Commit d060e2b4 authored by rlaz

Cleaning up scripts evaluate + confHist

parent 7be31d33
@@ -7,11 +7,13 @@ usage()
echo -e "\t\t[-p|--dotpdfDir <directory>] [-h|--help]"
echo ""
echo "------- Required Arguments -------"
echo "Note: Use either the output and target directories, or the fileList"
echo ""
echo "output_dir Output lg files directory"
echo "target_dir Ground truth lg files directory"
echo "fileList File whose each line contains outputfile_path targetfile_path"
echo -e "\t\t\t\t\tis used for comparison."
echo "Note: Use either the 2 directories or the fileList"
echo ""
echo -e "-gs or --graphSize <value> \t\tThe number of objects/primitives in targets to analyze"
echo ""
echo "------- Optional Arguments -------"
@@ -32,22 +34,28 @@ usage()
if [ $# -eq 0 ]
then
echo "LgEval confHist: Structure Confusion Histogram Generator"
echo "Copyright (c) R. Zanibbi, H. Mouchere, 2013-2014"
echo "Copyright (c) R. Zanibbi, H. Mouchere, A.K. Shah 2013-2022"
echo ""
echo "Usage: confHist (output_dir target_dir) | fileList -gs|--graphSize <value>"
echo -e "\t\t[-m|--minCount <value>] [-s|--strokes] [-i|--lgimgDir <directory>]"
echo "Usage: confHist (output_dir target_dir) | fileList"
echo " -gs|--graphSize <value> -m|--minCount <value>] [-s|--strokes]"
echo " [-i|--lgimgDir <directory>]"
# echo -e "\t\t[-p|--dotpdfDir <directory>] [--split] [--filter] [-h|--help]"
echo -e "\t\t[-p|--dotpdfDir <directory>] [-h|--help]"
echo " [-p|--dotpdfDir <directory>] [-h|--help]"
echo ""
echo "For details on arguments usage: confHist -h or confHist --help"
echo ""
echo "Creates an .html file containing structure confusion histograms"
echo "at the object level. The histograms visualize errors by their"
echo "frequency when comparing files in output_dir vs. target_dir (target_dir is 'ground truth')."
echo "Creates an .html file containing structure confusion histograms at the object level."
echo "The histograms visualize errors by their frequency when comparing files in output_dir"
echo "vs. target_dir (target_dir is 'ground truth')."
echo ""
echo "It is assumed that every .lg file in output_dir exists in target_dir, and a file"
echo "output_dir_vs_target_dir is created as output."
echo ""
echo "Output is written to the file confHist_outputs/CH_<output_dir_vs_target_dir__size_<graphSize>_min_<minCount>.html"
echo "or confHist_outputs/CH_<fileList__size_<graphSize>_min_<minCount>.html, depending upon the arguments used."
echo "Output is written to the file:"
echo " * confHist_outputs/CH_<output_dir_vs_target_dir__size_<graphSize>_min_<minCount>.html *OR*"
echo " * confHist_outputs/CH_<fileList__size_<graphSize>_min_<minCount>.html"
echo ""
echo "depending upon the arguments used."
exit 0
fi
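
For illustration only (not part of this commit), a minimal sketch of the two invocation forms described by the usage text above; directory and file names are hypothetical:

# Hypothetical paths; both forms follow the usage line printed above.
confHist myOutputs groundTruth -gs 2 -m 5              # compare two directories of .lg files
confHist pairs.txt --graphSize 3 --strokes -i lgImgs   # compare the file pairs listed in pairs.txt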
@@ -134,7 +142,7 @@ then
ls $output_dir/*.lg > _f1
ls $target_dir/*.lg > _f2
L1=`wc -l _f1 | awk '{print $1}'`
L1=`wc -l _f1 | awk '{print $1}'`
L2=`wc -l _f2 | awk '{print $1}'`
if [ "$L1" != "$L2" ]
then
@@ -160,4 +168,3 @@ else
--dotpdfDir $DOTPDF_DIR --split $SPLIT --filter $FILTER
fi
exit 0
@@ -13,7 +13,7 @@
if [ $# -lt 1 ]
then
echo "LgEval evaluate: Label graph evaluation tool"
echo "Copyright (c) R. Zanibbi, H. Mouchere, 2012-2014"
echo "Copyright (c) R. Zanibbi, H. Mouchere, M. Mahdavi, A.K. Shah 2012-2022"
echo ""
echo "Usage: evaluate outputDir groundTruthDir [p/t/d/s/b] [png/pdf/both] OR"
echo " evaluate fileList [p/t/d/s/b] [png/pdf/both]"
@@ -51,17 +51,30 @@ BNAME=`basename $1`
MODE="Dir"
FORMAT="pdf"
TARGETS=""
TARGET_COUNT=0
OUTPUTS=""
################################################################
# Compile the list of output files and ground truth files.
#
# NOTE: Ground truth files define the evaluation set, extra
# output files are ignored.
################################################################
# Case 1: Passed a list of file pairs
if ! [ -d $1 ]
then
MODE="List"
LABEL_STRING="List File: $1"
echo "$LABEL_STRING"
MODE="List"
# Get the output and target file paths from the list file
OUTPUTS=`awk '{ print $1; }' $1`
OUTARR=($OUTPUTS)
TARGETS=`awk '{ print $2; }' $1`
# Grab additional flags
if [ $# -gt 1 ]
then
DOTARG=$2
@@ -70,35 +83,46 @@ then
then
FORMAT=$3
fi
# Case 2: Passed a pair of directories
else
# Peculiar '$<string>' styntax is to preserve the newline.
OUT_STRING="Output File Directory: $1"
GT_STRING="Ground Truth Directory: $2"
# Peculiar '$<string>' syntax is to preserve the newline.
LABEL_STRING=$(printf '%s\n%s' "$OUT_STRING" "$GT_STRING")
echo "$LABEL_STRING"
OUTPUTS=`ls $1/*.lg`
TARGETS=`ls $2/*.lg`
# Grab additional flags
if [ $# -gt 2 ]
then
DOTARG=$3
fi
# RZ: Debug -- output type ignored
if [ $# -gt 3 ]
then
FORMAT=$4
fi
fi
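
For clarity (not part of the commit), the awk calls in Case 1 imply a list file with two whitespace-separated columns per line, e.g. a hypothetical pairs.txt:

# pairs.txt (hypothetical): column 1 = output .lg file, column 2 = ground-truth .lg file
myOutputs/formula001.lg   groundTruth/formula001.lg
myOutputs/formula002.lg   groundTruth/formula002.lg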
echo ""
TARGET_COUNT=$((`echo $TARGETS | wc -w`))
################################################################
# Create output directory structure, compile class labels
################################################################
ResultsDir=Results_$BNAME
if ! [ -d $ResultsDir ]
then
mkdir $ResultsDir
mkdir $ResultsDir/Metrics
# Create directories for dot error visualizations
if [ "$DOTARG" != "" ]
then
if [ $# -gt 3 ]
then
FORMAT=$4
fi
# RZ Debug: deleting FORMAT assignment (done above)
mkdir $ResultsDir/errorGraphs
mkdir $ResultsDir/errorGraphs/dot
if [ "$FORMAT" == "pdf" ]; then
@@ -113,22 +137,28 @@ then
fi
# Compute all .csv metrics outputs (per-file), and .diff results (per-file).
echo "Evaluating files..."
# Compile labels from ground truth. This is needed for confusion matrices to
# be properly defined.
# be properly defined, and for sanity checking results.
echo "$TARGETS" > $ResultsDir/tfileTarget
python3 $LgEvalDir/src/compileLabels.py "$ResultsDir/tfileTarget" > "$ResultsDir/labelsGT.txt"
# Do the same for outputs. This can come in handy in many places.
echo "$OUTPUTS" > $ResultsDir/tfileTarget
python3 $LgEvalDir/src/compileLabels.py "$ResultsDir/tfileTarget" > "$ResultsDir/labelsOutput.txt"
rm $ResultsDir/tfileTarget
################################################################
# Evaluate files
################################################################
# Compute all .csv metrics outputs (per-file), and .diff results (per-file).
echo ""
echo "Evaluating..."
# Iterate over ground truth files
INDEX=0
for file in $TARGETS
do
FNAME=`basename $file .lg`
nextFile="_ERROR_"
if [ $MODE == "Dir" ]
@@ -143,13 +173,23 @@ do
then
# NOTE: the script convertCrohmeLg can be used to convert
# crohme .inkml files to .lg files.
echo " >> Comparing $FNAME.lg"
python3 $LgEvalDir/src/evallg.py $nextFile $file m INTER > $ResultsDir/Metrics/$FNAME.csv
DIFF=`python3 $LgEvalDir/src/evallg.py $nextFile $file diff INTER`
CORRECT="Correct"
#echo -ne " >> Comparing $FNAME.lg"
# RZ: Repairing to avoid running evaluation twice.
python3 $LgEvalDir/src/evallg.py $nextFile $file INTER > $ResultsDir/Metrics/$FNAME.csv
METRICS=`grep -v "\*" $ResultsDir/Metrics/$FNAME.csv`
# Extract diff records ('*' lines) before the .csv is overwritten with metrics only.
DIFF=`grep "\*" $ResultsDir/Metrics/$FNAME.csv`
echo $METRICS > $ResultsDir/Metrics/$FNAME.csv
#echo "$METRICS"
#read V
#echo "$DIFF"
#read V
# If differences reported, record them
if [ -n "$DIFF" ]
then
CORRECT="Incorrect"
echo "$DIFF" > $ResultsDir/Metrics/$FNAME.diff
# If a third argument is provided, generate a .pdf file to visualize
@@ -172,8 +212,6 @@ do
mv $FNAME.png $ResultsDir/errorGraphs/png
fi
fi
CORRECT="Incorrect"
else
rm -f $ResultsDir/Metrics/$FNAME.diff
fi
@@ -185,10 +223,16 @@ do
fi
INDEX=$((INDEX+1))
PERCENT=`echo "scale=1; 100 * $INDEX / $TARGET_COUNT" | bc`
echo -ne " $PERCENT% complete ($INDEX of $TARGET_COUNT)\r"
done
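
A worked example (not from the commit) of the progress line computed at the end of the loop: with INDEX=37 and TARGET_COUNT=120, bc truncates 100 * 37 / 120 to one decimal place:

PERCENT=`echo "scale=1; 100 * 37 / 120" | bc`   # -> 30.8
echo -ne " $PERCENT% complete (37 of 120)\r"    # prints "30.8% complete (37 of 120)"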
# Compile all metrics/diffs,
# and then compute metric summaries and confusion matrices.
################################################################
# Compile metrics
# Including summaries and confusion matrices
################################################################
cat $ResultsDir/Metrics/*.csv > $ResultsDir/$BNAME.csv
ALLDIFFS=`ls $ResultsDir/Metrics | grep .diff`
if [ -n "$ALLDIFFS" ]
@@ -203,6 +247,10 @@ python3 $LgEvalDir/src/sumMetric.py "$LABEL_STRING" $ResultsDir/$BNAME.csv > $Re
python3 $LgEvalDir/src/sumDiff.py $ResultsDir/$BNAME.diff $ResultsDir/labelsGT.txt html > $ResultsDir/ConfusionMatrices.html
python3 $LgEvalDir/src/sumDiff.py $ResultsDir/$BNAME.diff $ResultsDir/labelsGT.txt > $ResultsDir/ConfusionMatrices.csv
################################################################
# Create spreadsheet
################################################################
# RZ Oct. 2014: Create spreadsheet pairing file names with metrics.
# Clean up raw metric data to make the file smaller and simpler.
# Use awk and head to select every odd (headers) and even (data) columns,
@@ -213,7 +261,6 @@ awk -F',' '{ for (i=1;i<=NF;i+=2) printf ("%s%c", $i, i + 2 <= NF ? "," : "\n")}
head -n 1 $ResultsDir/Headers.csv > $ResultsDir/HeaderRow.csv
HEAD=`cat $ResultsDir/HeaderRow.csv`
echo "File,Result,$HEAD" > $ResultsDir/HeaderRow.csv
awk -F',' '{ for (i=2;i<=NF;i+=2) printf ("%s%c", $i, i + 2 <= NF ? "," : "\n")}' $ResultsDir/$BNAME.csv > $ResultsDir/Data.csv
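
To illustrate (not part of the commit) the odd/even column selection used above, applied to a hypothetical one-line metrics row:

# Odd columns are header names, even columns are values (row contents are hypothetical).
echo "Objects,10,Relations,4,Files,1" | awk -F',' '{ for (i=1;i<=NF;i+=2) printf ("%s%c", $i, i + 2 <= NF ? "," : "\n")}'   # -> Objects,Relations,Files
echo "Objects,10,Relations,4,Files,1" | awk -F',' '{ for (i=2;i<=NF;i+=2) printf ("%s%c", $i, i + 2 <= NF ? "," : "\n")}'   # -> 10,4,1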
# Combine file names with raw data metrics, then add header labels.
@@ -223,10 +270,8 @@ cat $ResultsDir/HeaderRow.csv $ResultsDir/DataNew.csv > $ResultsDir/FileMetrics.
# Clean up
rm -f $ResultsDir/Headers.csv $ResultsDir/HeaderRow.csv $ResultsDir/Data.csv
rm -f $ResultsDir/DataNew.csv $ResultsDir/FileResults.csv
# Remove the compiled metrics and differences, but leave the individual metric/diff
# files in Metrics to support debugging for malformed or missing files, etc.
rm -f $ResultsDir/$BNAME.csv $ResultsDir/$BNAME.diff
echo ""
echo "done."
@@ -37,6 +37,7 @@ def main(
pdf_count = 0
if os.path.exists(dotpdf_dir):
pdf_count = len(glob(os.path.join(dotpdf_dir, "*.pdf")))
if pdf_count == 0:
dotpdf_dir = "confHist_outputs/dotpdfs"
print(
@@ -46,6 +47,7 @@ def main(
)
if not os.path.exists(dotpdf_dir):
os.makedirs(dotpdf_dir)
for row in fileReader:
# Skip comments and empty lines.
if not row == [] and not row[0].strip()[0] == "#":
@@ -128,7 +130,7 @@ def main(
'<script src="https://cdnjs.cloudflare.com/ajax/libs/FileSaver.js/1.0.0/FileSaver.min.js"></script>\n'
)
# (Excuse the messs..) create callbacks for checkbox events, save button
# (Excuse the mess..) create callbacks for checkbox events, save button
# which saves the unique list of selected files in sorted order.
# This was a slow, painful way to do this - perhaps an 'include' would be better.
JS_DIR = os.path.join(fileList_head, "js")
@@ -362,10 +364,7 @@ def parse_args():
if __name__ == "__main__":
args = parse_args()
# print(args)
# img_dir = '../../../../data/test2019_inkml2img' # 3branch...
# img_dir = '../../../../data/infty/IMG' # infty_contour...
# img_dir = '../../../Data/Expressions/IMG' # lpga_rf...
img_dir = os.path.join("../..", args.lgimgDir)
main(
args.fileList,
......