Merge branch 'repair-io' into 'main'

Improved 'evaluate' script and README. See merge request dprl/lgeval!5

Merge branch 'repair-io' into 'main'
Improved 'evaluate' script and README. See merge request dprl/lgeval!5
4884979b · R · 7070867c · 58dc0a62 · 4884979b · 4884979b
--- a/README.md
+++ b/README.md
--- a/bin/confHist
+++ b/bin/confHist
@@ -7,11 +7,13 @@ usage()
 	echo -e "\t\t[-p|--dotpdfDir <directory>] [-h|--help]"
 	echo ""
  echo "------- Required Arguments -------"
+  echo "Note: Use either the output and target directories, or the fileList"
+  echo ""
  echo "output_dir    				Output lg files directory"
  echo "target_dir    				Ground truth lg files directory"
 	echo "fileList      				File whose each line contains outputfile_path targetfile_path"
 	echo -e "\t\t\t\t\tis used for comparison."
-	echo "Note: Use either the 2 directories or the fileList"
+	echo ""
 	echo -e "-gs or --graphSize <value> \t\tThe number of objects/primitives in targets to analyze"
  echo "" 
  echo "------- Optional Arguments -------"
@@ -32,22 +34,28 @@ usage()
 if [ $# -eq 0 ]
 then
 	echo "LgEval confHist: Structure Confusion Histogram Generator"
-	echo "Copyright (c) R. Zanibbi, H. Mouchere, 2013-2014"
+	echo "Copyright (c) R. Zanibbi, H. Mouchere, A.K. Shah 2013-2022"
 	echo ""
-	echo "Usage: confHist (output_dir target_dir) | fileList -gs|--graphSize <value>"
-	echo -e "\t\t[-m|--minCount <value>] [-s|--strokes] [-i|--lgimgDir <directory>]"
+	echo "Usage: confHist (output_dir target_dir) | fileList"
+	echo "               -gs|--graphSize <value> [-m|--minCount <value>] [-s|--strokes]"
+	echo "               [-i|--lgimgDir <directory>]"
 	# echo -e "\t\t[-p|--dotpdfDir <directory>] [--split] [--filter] [-h|--help]"
-	echo -e "\t\t[-p|--dotpdfDir <directory>] [-h|--help]"
+	echo "               [-p|--dotpdfDir <directory>] [-h|--help]"
+	echo ""
 	echo "For details on arguments usage: confHist -h or confHist --help"
 	echo ""
-	echo "Creates an .html file containing structure confusion histograms"
-	echo "at the object level. The histograms visualize errors by their"
-	echo "frequency when comparing files in output_dir vs. target_dir (target_dir is 'ground truth')."
+	echo "Creates an .html file containing structure confusion histograms at the object level."
+	echo "The histograms visualize errors by their frequency when comparing files in output_dir"
+	echo "vs. target_dir (target_dir is 'ground truth')."
+	echo ""
 	echo "It is assumed that every .lg file in output_dir exists in target_dir, and a file"
 	echo "output_dir_vs_target_dir is created as output."
 	echo ""
-	echo "Output is written to the file confHist_outputs/CH_<output_dir_vs_target_dir__size_<graphSize>_min_<minCount>.html"
-	echo "or confHist_outputs/CH_<fileList__size_<graphSize>_min_<minCount>.html, depending upon the arguments used."
+	echo "Output is written to the file:"
+	echo "  * confHist_outputs/CH_<output_dir_vs_target_dir__size_<graphSize>_min_<minCount>.html *OR*"
+	echo "  * confHist_outputs/CH_<fileList__size_<graphSize>_min_<minCount>.html"
+	echo ""
+	echo "depending upon the arguments used."
 	exit 0
 fi

@@ -134,7 +142,7 @@ then
 	ls $output_dir/*.lg > _f1
 	ls $target_dir/*.lg > _f2

-	L1=`wc -l _f1 | awk '{print $1}'`
+	L1=`wc -l _f1 | awk '{print $1}'` 
 	L2=`wc -l _f2 | awk '{print $1}'`
 	if [ "$L1" != "$L2" ]
 	then
@@ -160,4 +168,3 @@ else
 		--dotpdfDir $DOTPDF_DIR --split $SPLIT --filter $FILTER
 fi

-exit 0
--- a/bin/evaluate
+++ b/bin/evaluate
@@ -13,36 +13,20 @@
 if [ $# -lt 1 ]
 then
 	echo "LgEval evaluate: Label graph evaluation tool"
-	echo "Copyright (c) R. Zanibbi, H. Mouchere, 2012-2014"
+	echo "Copyright (c) R. Zanibbi, H. Mouchere, M. Mahdavi, A.K. Shah 2012-2022"
 	echo ""
 	echo "Usage: evaluate outputDir groundTruthDir [p/t/d/s/b] [png/pdf/both] OR"
 	echo "       evaluate fileList [p/t/d/s/b] [png/pdf/both]"
 	echo ""
-	echo "Evaluates all label graph (.lg) files in outputDir against"
-	echo "corresponding files in groundTruthDir. groundTruthDir is used"
-	echo "to generate the list of files to be compared (i.e. if a file is"
-	echo "not in the ground truth directory, it will not be considered)."
+	echo "Evaluates label graph (.lg) files in outputDir against the same files"
+	echo "in groundTruthDir. groundTruthDir defines the list of files to be compared"
+	echo "(i.e. if a file is not in the ground truth directory, it is ignored)."
 	echo ""
-	echo "If a list of file pairs is provided instead ('output target' on each line)"
+	echo "If a list of file pairs is provided ('output target' provided on each line)"
 	echo "then these file pairs are used for evaluation."
 	echo ""
-	echo "Outputs"
-	echo "-----------------------------"
-	echo " Results<outputDir/fileListName>/"
-	echo "    ConfusionMatrices.*:    confusion matrix spreadsheet (errors in csv/html)"
-	echo "    FileMetrics.csv:        file metrics spreadsheet"
-	echo "    Summary.txt:            summary of performance metrics"
-	echo "    labelsGT.txt:           list of node and edge labels in ground truth"
-	echo "    labelsOutput.txt:       list of node and edge labels in output files"
-	echo "" 
-	echo "    Metrics/: directory with .csv (metric) and .diff (difference) files"
-	echo "    graphErrors/: if dot output requested, visualizations for files with"
-	echo -e "\t\t  errors are stored here (.dot and .pdf[default] or .png or both as specified)."
-	echo ""
-	echo "NOTE: the different visualizations of structural differences are described"
-	echo "      if you run lg2dot without arguments (object (t)ree; (d)irected graph"
-	echo "      over objects; primitive (s)egmentation graph; (b)ipartite graph over"
-	echo "      primitives; (p): default directed graph over primitives."
+	echo "The final optional arguments define the graph type to use in visualizing"
+	echo "errors, and their output format. Run 'lg2dot' for more on graph types."
 	exit 0
 fi

@@ -51,17 +35,39 @@ BNAME=`basename $1`
 MODE="Dir"
 FORMAT="pdf"
 TARGETS=""
+TARGET_COUNT=0
 OUTPUTS=""
+NL=$'\n'
+
+OUTCOME_LIST=""
+ResultsDir=Results_$BNAME
+MULTI_PASS_WARN=0
+
+################################################################
+# Compile the list of output files and ground truth files.
+#
+# NOTE: Ground truth files define the evaluation set, extra 
+# output files are ignored.
+################################################################
+
+echo ""
+echo "[ LgEval evaluate ]"
+echo ""
+
+# Case 1: Passed a list of file pairs
 if ! [ -d $1 ]
 then
+	MODE="List"
+
 	LABEL_STRING="List File: $1"
 	echo "$LABEL_STRING"
-	MODE="List"
+
 	# Get the targets
 	OUTPUTS=`awk '{ print $1; }' $1`
 	OUTARR=($OUTPUTS)
 	TARGETS=`awk '{ print $2; }' $1`

+	# Grab additional flags
 	if [ $# -gt 1 ]
 	then
 		DOTARG=$2
@@ -70,35 +76,47 @@ then
 	then
 		FORMAT=$3
 	fi
+
+# Case 2: Passed a pair of directories
 else
-	# Peculiar '$<string>' styntax is to preserve the newline.
 	OUT_STRING="Output File Directory:  $1"
 	GT_STRING="Ground Truth Directory: $2"
+	# Peculiar '$<string>' syntax is to preserve the newline.
 	LABEL_STRING=$(printf '%s\n%s' "$OUT_STRING" "$GT_STRING")
 	echo "$LABEL_STRING"

 	OUTPUTS=`ls $1/*.lg`
 	TARGETS=`ls $2/*.lg`

+	# Grab additional flags
 	if [ $# -gt 2 ]
 	then
 		DOTARG=$3
 	fi
+
+	# RZ: Debug -- output type ignored
+	if [ $# -gt 3 ]
+	then
+		FORMAT=$4
+	fi
 fi
-echo ""
+echo "* LgEval Results Directory: $ResultsDir"

-ResultsDir=Results_$BNAME
+TARGET_COUNT=$((`echo $TARGETS | wc -w`))
+
+
+################################################################
+# Create output directory structure, compile class labels
+################################################################
 if ! [ -d $ResultsDir ]
 then
 	mkdir $ResultsDir
 	mkdir $ResultsDir/Metrics

+	# Create directories for dot error visualizations
 	if [ "$DOTARG" != "" ]
 	then
-		if [ $# -gt 3 ]
-		then
-			FORMAT=$4
-		fi
+		# RZ Debug: deleting FORMAT assignment (done above)
 		mkdir $ResultsDir/errorGraphs
 		mkdir $ResultsDir/errorGraphs/dot
 		if [ "$FORMAT" == "pdf" ]; then
@@ -112,20 +130,24 @@ then
 	fi
 fi

+# Compile labels from ground truth. This is needed for confusion matrices to
+# be properly defined, and for sanity checking results.
+echo "$TARGETS" > $ResultsDir/temp_file_list
+python3 $LgEvalDir/src/compileLabels.py "$ResultsDir/temp_file_list" > "$ResultsDir/labelsGT.txt"
+echo "$OUTPUTS" > $ResultsDir/temp_file_list
+python3 $LgEvalDir/src/compileLabels.py "$ResultsDir/temp_file_list" > "$ResultsDir/labelsOutput.txt"
+rm $ResultsDir/temp_file_list

-# Compute all .csv metrics outputs (per-file), and .diff results (per-file).
-echo "Evaluating files..."

-# Compile labels from ground truth. This is needed for confusion matrices to
-# be properly defined.
-echo "$TARGETS" > $ResultsDir/tfileTarget
-python3 $LgEvalDir/src/compileLabels.py "$ResultsDir/tfileTarget" > "$ResultsDir/labelsGT.txt"
+################################################################
+# Evaluate files
+################################################################

-# Do the same for outputs. This can come handy in many places.
-echo "$OUTPUTS" > $ResultsDir/tfileTarget
-python3 $LgEvalDir/src/compileLabels.py "$ResultsDir/tfileTarget" > "$ResultsDir/labelsOutput.txt"
-rm $ResultsDir/tfileTarget
+# Compute all .csv metrics outputs (per-file), and .diff results (per-file).
+echo ""
+echo "Evaluating..."

+# Iterate over ground truth files
 INDEX=0
 for file in $TARGETS
 do
@@ -143,14 +165,25 @@ do
 	then
 		# NOTE: the script convertCrohmeLg can be used to convert
 		#       crohme .inkml files to .lg files.
-		echo "  >> Comparing $FNAME.lg"
-
-		python3 $LgEvalDir/src/evallg.py $nextFile $file m INTER > $ResultsDir/Metrics/$FNAME.csv
-		DIFF=`python3 $LgEvalDir/src/evallg.py $nextFile $file diff INTER`
 		CORRECT="Correct"
+
+		# RZ: Run evaluation once vs. twice
+		OUT=`python3 $LgEvalDir/src/evallg.py $nextFile $file INTER`
+
+		# Select lines containing an asterisk (pattern is not anchored) to pick out differences/errors
+		# WARNING: Double quotes are important to preserve newlines!
+		# Only create DIFF files for differences.
+		DIFF=`echo "$OUT" | grep "\*"`
 		if [ -n "$DIFF" ]
 		then
-			echo "$DIFF" > $ResultsDir/Metrics/$FNAME.diff 
+			echo "$DIFF" > $ResultsDir/Metrics/$FNAME.diff
+		fi
+		echo "$OUT" | grep -v "\*" > $ResultsDir/Metrics/$FNAME.csv
+
+		# If differences reported, record files with errors, generate visualizations
+		if [ "$DIFF" != "" ]
+		then
+			CORRECT="Incorrect"

 			# If a third argument is provided, generate a .pdf file to visualize
 			# differences between graphs.
@@ -162,6 +195,7 @@ do
 				else
 					lg2dot $nextFile $file --graph_type "$DOTARG" --format $FORMAT
 				fi
+
 				mv $FNAME.dot $ResultsDir/errorGraphs/dot
 				if [ "$FORMAT" == "pdf" ]; then
 					mv $FNAME.pdf $ResultsDir/errorGraphs/pdf
@@ -172,24 +206,50 @@ do
 					mv $FNAME.png $ResultsDir/errorGraphs/png
 				fi
 			fi
-			
-			CORRECT="Incorrect"
-		else
-			rm -f $ResultsDir/Metrics/$FNAME.diff
 		fi

-		# Add record of evaluating the file.
-		echo "$nextFile, $CORRECT" >> $ResultsDir/FileResults.csv
+		# Record whether file was correct or not.
+		if [ $((INDEX)) == 0 ]
+		then
+			OUTCOME_LIST="$nextFile, $CORRECT"
+		else
+			OUTCOME_LIST=`printf "%s\n%s" "$OUTCOME_LIST" "$nextFile, $CORRECT"`
+		fi
 	else
-		echo "    Already processed: $file"
+		if [ $((MULTI_PASS_WARN)) == 0 ]
+		then
+			echo "  * Already processed: $file"
+			echo "    (message suppressed for other files)"
+			MULTI_PASS_WARN=1
+		fi
 	fi

 	INDEX=$((INDEX+1))
+	PERCENT=`echo "scale=1; 100 * $INDEX / $TARGET_COUNT" | bc`
+	
+	if [ $((`expr $INDEX % 1`)) == 0 ]
+	then
+		echo -ne "  $PERCENT% complete ($INDEX of $TARGET_COUNT)\r"
+	fi
 done
+echo -ne "  $PERCENT% complete ($INDEX of $TARGET_COUNT)\r"
+

-# Compile all metrics/diffs,
-# and then compute metric summaries and confusion matrices.
+################################################################
+# Compile metrics 
+# Including summaries and confusion matrices
+#
+# Stored as individual files to prevent re-computation for user
+################################################################
+
+if [ -n "$OUTCOME_LIST" ]
+then
+	# Need to avoid adding empty entries in Correct.csv, and sort by filename
+	echo "$OUTCOME_LIST" >> $ResultsDir/Correct.csv
+	sort -o $ResultsDir/Correct.csv $ResultsDir/Correct.csv
+fi
 cat $ResultsDir/Metrics/*.csv > $ResultsDir/$BNAME.csv
+
 ALLDIFFS=`ls $ResultsDir/Metrics | grep .diff`
 if [ -n "$ALLDIFFS" ]
 then
@@ -199,34 +259,58 @@ else
 	touch $ResultsDir/$BNAME.diff  # empty - no errors.
 fi

-python3 $LgEvalDir/src/sumMetric.py "$LABEL_STRING" $ResultsDir/$BNAME.csv > $ResultsDir/Summary.txt
-python3 $LgEvalDir/src/sumDiff.py $ResultsDir/$BNAME.diff $ResultsDir/labelsGT.txt html > $ResultsDir/ConfusionMatrices.html
-python3 $LgEvalDir/src/sumDiff.py $ResultsDir/$BNAME.diff $ResultsDir/labelsGT.txt  > $ResultsDir/ConfusionMatrices.csv
+# Compute summaries 
+python3 $LgEvalDir/src/sumMetric.py "$LABEL_STRING" $ResultsDir/$BNAME.csv > \
+	$ResultsDir/Summary.txt
+python3 $LgEvalDir/src/sumDiff.py $ResultsDir/$BNAME.diff $ResultsDir/labelsGT.txt html > \
+	$ResultsDir/ConfusionMatrices.html
+python3 $LgEvalDir/src/sumDiff.py $ResultsDir/$BNAME.diff $ResultsDir/labelsGT.txt  > \
+	$ResultsDir/ConfusionMatrices.csv

-# RZ Oct. 2014: Create spreadsheet pairing file names with metrics.
-# Clean up raw metric data to make the file smaller and simpler.
+
+################################################################
+# Create FileMetrics.csv and summary spreadsheet
+################################################################
 # Use awk and head to select every odd (headers) and even (data) columns,
 # Concatenate one header row with data contents.
 awk -F',' '{ for (i=1;i<=NF;i+=2) printf ("%s%c", $i, i + 2 <= NF ? "," : "\n")}' $ResultsDir/$BNAME.csv > $ResultsDir/Headers.csv
+awk -F',' '{ for (i=2;i<=NF;i+=2) printf ("%s%c", $i, i + 2 <= NF ? "," : "\n")}' $ResultsDir/$BNAME.csv > $ResultsDir/Data.csv

 # Obtain first row for data labels; insert a "File" label in the first column.
 head -n 1 $ResultsDir/Headers.csv > $ResultsDir/HeaderRow.csv
 HEAD=`cat $ResultsDir/HeaderRow.csv`
 echo "File,Result,$HEAD" > $ResultsDir/HeaderRow.csv

-awk -F',' '{ for (i=2;i<=NF;i+=2) printf ("%s%c", $i, i + 2 <= NF ? "," : "\n")}' $ResultsDir/$BNAME.csv > $ResultsDir/Data.csv
-
 # Combine file names with raw data metrics, then add header labels.
-paste -d , $ResultsDir/FileResults.csv $ResultsDir/Data.csv > $ResultsDir/DataNew.csv
+paste -d , $ResultsDir/Correct.csv $ResultsDir/Data.csv > $ResultsDir/DataNew.csv
 cat $ResultsDir/HeaderRow.csv $ResultsDir/DataNew.csv > $ResultsDir/FileMetrics.csv

-# Clean up
-rm -f $ResultsDir/Headers.csv $ResultsDir/HeaderRow.csv $ResultsDir/Data.csv
-rm -f $ResultsDir/DataNew.csv $ResultsDir/FileResults.csv

-# Remove the compiled metrics and differences, but leave the individual metric/diff
-# files in Metrics to support debugging for malformed or missing files, etc.
+##################################
+# Clean up 
+##################################
+rm -f $ResultsDir/Headers.csv $ResultsDir/HeaderRow.csv $ResultsDir/Data.csv
+rm -f $ResultsDir/DataNew.csv 
+# RZ: not deleting Correct.csv, to ensure that all files are present.
+#rm -f $ResultsDir/Correct.csv
 rm -f $ResultsDir/$BNAME.csv $ResultsDir/$BNAME.diff

+
+##################################
+# Remind user of outputs
+##################################
+echo ""
 echo "done."
+echo ""
+echo "$ResultsDir/ contents:"
+echo "   Summary.txt     --   Readable metrics summary"
+echo "   Correct.csv     --   Records which files are correct/incorrect"
+echo "   errorGraphs/    --   Error graph visualizations (if requested)"
+echo "   labelsOutput.txt --  Node & edge labels in output files"
+echo "   labelsGT.txt    --   Node & edge labels in ground truth files"
+echo "   FileMetrics.csv --   Raw metrics file"
+echo "   ConfusionMatrices.html  -- Readable web page with confusion matrices (HTML)"
+echo "   ConfusionMatrices.csv   -- Confusion matrix (CSV format)"
+echo "   Metrics/                -- Individual file metrics (.csv) & differences (.diff)"
+echo ""

--- a/src/confHists.py
+++ b/src/confHists.py
@@ -37,6 +37,7 @@ def main(
    pdf_count = 0
    if os.path.exists(dotpdf_dir):
        pdf_count = len(glob(os.path.join(dotpdf_dir, "*.pdf")))
+
    if pdf_count == 0:
        dotpdf_dir = "confHist_outputs/dotpdfs"
        print(
@@ -46,6 +47,7 @@ def main(
        )
        if not os.path.exists(dotpdf_dir):
            os.makedirs(dotpdf_dir)
+
    for row in fileReader:
        # Skip comments and empty lines.
        if not row == [] and not row[0].strip()[0] == "#":
@@ -128,7 +130,7 @@ def main(
        '<script src="https://cdnjs.cloudflare.com/ajax/libs/FileSaver.js/1.0.0/FileSaver.min.js"></script>\n'
    )

-    # (Excuse the messs..) create callbacks for checkbox events, save button
+    # (Excuse the mess..) create callbacks for checkbox events, save button
    # which saves the unique list of selected files in sorted order.
    # This was a slow, painful way to do this - perhaps an 'include' would be better.
    JS_DIR = os.path.join(fileList_head, "js")
@@ -362,10 +364,7 @@ def parse_args():

 if __name__ == "__main__":
    args = parse_args()
-    # print(args)
-    # img_dir = '../../../../data/test2019_inkml2img'  # 3branch...
-    # img_dir = '../../../../data/infty/IMG'   # infty_contour...
-    # img_dir = '../../../Data/Expressions/IMG'   # lpga_rf...
+    
    img_dir = os.path.join("../..", args.lgimgDir)
    main(
        args.fileList,

--- a/src/lg.py
+++ b/src/lg.py
@@ -144,7 +144,7 @@ class Lg(object):
                except:
                    # Create an empty graph if a file cannot be found.
                    # Set the error flag.
-                    sys.stderr.write(str(sys.exc_info()[0]))
+                    #sys.stderr.write(str(sys.exc_info()[0]))
                    sys.stderr.write("  !! IO Error (cannot open): " + fileName + "\n")
                    self.error = True
                    return
@@ -1298,15 +1298,15 @@ class Lg(object):

        # WARN about absent nodes/edges; indicate that there is an error.
        if len(self.absentNodes) > 0:
-            sys.stderr.write(
-                "  !! Inserting ABSENT nodes for:\n      "
-                + self.file
-                + " vs.\n      "
-                + lg2.file
-                + "\n      "
-                + str(sorted(list(self.absentNodes)))
-                + "\n"
-            )
+            #sys.stderr.write(
+            #    "  !! Inserting ABSENT nodes for "
+            #    + self.file
+            #    + " vs. "
+            #    + lg2.file
+            #    + "\n"
+            #    + str(sorted(list(self.absentNodes)))
+            #    + "\n"
+            #)
            self.error = True

        # Add "absent" nodes.