Commit 43a14840 authored by Antoine PIGEAU's avatar Antoine PIGEAU
Browse files

Merge remote-tracking branch 'origin/master'

# Conflicts:
#	source/main/ocr/classificationExperimentMerged.py
#	source/main/oulad/classificationExperimentMerged.py
parents 40e202f8 b983ea7e
......@@ -39,3 +39,7 @@ class Constant:
PCA = "PCA"
ALL_MODELS = [AdaBoost, LogisticRegression, RandomForest, SVM, Perceptron, LstmCategorical]
FORWARD = 'forward'
BACKWARD = 'backward'
\ No newline at end of file
......@@ -38,7 +38,7 @@ from sklearn.metrics import auc
from featureManager.normalize import Normalize
from exportManager import exportLatex
from featureManager.featureMultiGroup import FeatureMultiGroup
from classifierManager.constant import Constant
class ScriptClassifier:
'''
......@@ -261,11 +261,11 @@ class ScriptClassifier:
if os.path.exists(fileNameSavedResult) and cache:
with open(fileNameSavedResult, "rb") as fileResult:
(accuracy, confusionMatrice, dictResultWeight, auc) = pickle.load(fileResult)
(accuracy, confusionMatrice, dictResultWeight, auc, stdAcurracy, stdAuc) = pickle.load(fileResult)
print(str(fileNameSavedResult)+" is already done")
return (accuracy, confusionMatrice, dictResultWeight, auc)
return (accuracy, confusionMatrice, dictResultWeight, auc, stdAcurracy, stdAuc)
#self.classifier.testIdCourse = testIdCourse
......@@ -291,12 +291,14 @@ class ScriptClassifier:
confusionMatrice = confusionMatrice / float(ntime)
s = np.sum(confusionMatrice, axis=0)
confusionMatrice = confusionMatrice / s
stdAccuracy = np.std(accuracys)
stdDAuc = np.std(aucScores)
if cache:
with open(fileNameSavedResult, "wb") as fileResult:
pickle.dump((np.average(accuracys), confusionMatrice, dictResultWeight, np.nanmean(aucScores)), fileResult)
pickle.dump((np.average(accuracys), confusionMatrice, dictResultWeight, np.nanmean(aucScores), stdAccuracy, stdDAuc), fileResult)
return (np.average(accuracys), confusionMatrice, dictResultWeight, np.nanmean(aucScores))
return (np.average(accuracys), confusionMatrice, dictResultWeight, np.nanmean(aucScores), stdAccuracy, stdDAuc)
def setFeatures(self, course=None, whereToCut=None):
......@@ -333,7 +335,7 @@ class ScriptClassifier:
self.setFeatures(course, whereToCut)
(accuracy, confusionMatrix, dictWeight, aucScore) = self.predictionTaskNTimes(
(accuracy, confusionMatrix, dictWeight, aucScore, _, _) = self.predictionTaskNTimes(
course,
whereToCut=whereToCut,
ntime=ntime)
......@@ -365,7 +367,9 @@ class ScriptClassifier:
scoreFinal = np.average(accuracies)
stdAccuracy = np.std(accuracies)
aucScoreFinal = np.nanmean(aucScores)
stdAuc = np.std(aucScores)
fileResult.write("\n final Accuracy("+str(scoreFinal)+")")
fileResult.write("\n final AUC ("+str(aucScoreFinal)+") \n")
......@@ -392,7 +396,7 @@ class ScriptClassifier:
fileResult.write(str(resultWeigth))
return (accuracies, accuracysPerClass, aucScores)
return (accuracies, accuracysPerClass, aucScores, stdAccuracy, stdAuc)
def predictionTaskForAllPeriods(self, ntime = 10):
......@@ -406,7 +410,7 @@ class ScriptClassifier:
for t in self.classifier.whereToCuts:
accuracies, accuracysPerClass, rocScores = self.predictionTaskForAllCourses(t, ntime)
accuracies, accuracysPerClass, aucScores, stdAccuracy, stdAuc = self.predictionTaskForAllCourses(t, ntime)
for i, idCourse in enumerate(self.classifier.getIdCourses()): #
......@@ -415,7 +419,7 @@ class ScriptClassifier:
if not scoresCourse:
dictResult[idCourse] = scoresCourse
scoresCourse.append((accuracies[i], accuracysPerClass[i], rocScores[i]))
scoresCourse.append((accuracies[i], accuracysPerClass[i], aucScores[i]))
fileNameSavedResult = os.path.join(self.directoryExperiment,
self.fileName+
......@@ -433,7 +437,7 @@ class ScriptClassifier:
self.classifier.getFeatures(),
self.classifier.getWhereToCuts,
self.classifier.getWhereToCutUnity(),
groups), fileResult)
groups, stdAccuracy, stdAuc), fileResult)
fileNameResultAccuracy = os.path.join(self.directoryExperiment,
"Latex"+
......@@ -627,6 +631,190 @@ class ScriptClassifier:
exportLatex.exportResultAccuracy(fileNameResultAccuracy, dictResult, self.classifier.getCourses())
exportLatex.exportResultAUC(fileNameResultAUC, dictResult, self.classifier.getCourses())
def getBestFeaturesAllCourses(self,
                              method,
                              features=None,
                              whereToCut=None,
                              dictBestFeature=None):
    '''
    Run the per-course feature selection on every course of the classifier,
    then rank the features over all the courses and write the ranking to a text file.

    @param method: Constant.FORWARD or Constant.BACKWARD
    @param features: candidate features; when None, a copy of self.classifier.features is used
    @param whereToCut: period/session forwarded to the per-course selection and used in the result file name
    @param dictBestFeature: dictionary that contains all the best features for all the courses on each period
        Key is the course id and value is a dictionary [idCourse] = [list of best features]
        (only forwarded to the per-course selection here)

    @return: dictBestFeature, as received
    @raise ValueError: when method is neither Constant.FORWARD nor Constant.BACKWARD
    '''
    # Validate the method once, before any (expensive) selection is started.
    # The original raised a plain string from inside a try/except ValueError block,
    # which is itself a TypeError in Python 3.
    if method == Constant.BACKWARD:
        selectBestFeatures = self.getBackwardBestFeaturesCourse
    elif method == Constant.FORWARD:
        selectBestFeatures = self.getForwardBestFeaturesCourse
    else:
        raise ValueError("getBestFeaturesAllCourses(): unknown method " + repr(method))

    if features is None:
        features = self.classifier.features.copy()

    # NOTE(review): the entries of self.classifier.courses are handed over as the 'course'
    # argument of the per-course selection and are also used as keys here - confirm they
    # are course objects and not bare ids.
    dictCourses = {}
    for course in self.classifier.courses:
        (featuresBest, _) = selectBestFeatures(course,
                                               features,
                                               whereToCut=whereToCut,
                                               dictBestFeature=dictBestFeature)
        dictCourses[course] = featuresBest

    print(dictCourses)

    # Forward: a feature selected early gets a higher score (len(features) - rank).
    # Backward: every feature kept for a course simply counts for one.
    finalBestFeature = {}
    for feature in features:
        for featuresBest in dictCourses.values():
            if feature not in featuresBest:
                score = 0
            elif method == Constant.FORWARD:
                score = len(features) - featuresBest.index(feature)
            else:
                score = 1
            finalBestFeature[feature] = finalBestFeature.get(feature, 0) + score

    fileNameResult = os.path.join(self.directoryExperiment,
                                  self.fileName+'OrderedFeaturesAll'+
                                  'CourseSession'+str(whereToCut)+
                                  'Method='+str(method)+
                                  '.txt')

    with open(fileNameResult, 'w') as fileResult:
        resultWeigth = sorted(finalBestFeature.items(), key=operator.itemgetter(1), reverse=True)
        fileResult.write(str(resultWeigth))

    # The per-course selection overwrites self.classifier.features; restore the candidate set.
    self.classifier.features = features.copy()

    return dictBestFeature
def getBestFeaturesAllPeriods(self, method):
    '''
    Run getBestFeaturesAllCourses() once per period, always starting from the
    feature set the classifier had when this method was called.

    @param method: feature selection method, forwarded to getBestFeaturesAllCourses()
    '''
    # None is processed first, then 75, 50 and 25.
    sessions = [None, 75, 50, 25]

    features = self.classifier.features.copy()

    for t in sessions:
        # Keyword arguments only: passing 'features' positionally would land in the
        # 'method' slot and clash with method=... (TypeError: multiple values).
        self.getBestFeaturesAllCourses(method=method,
                                       features=features,
                                       whereToCut=t)

    # Put the initial feature set back on the classifier.
    self.classifier.features = features.copy()
def getForwardBestFeaturesCourse(self,
                                 course,
                                 features,
                                 whereToCut=None,
                                 dictBestFeature=None):
    '''
    forward method for the feature selection

    Starting from an empty set, the feature giving the highest AUC (as returned by
    predictionTaskNTimes()) is added at each round; the process stops as soon as a
    round does not improve the score of the previous one. The ordered features and
    their scores are cached in a pickle file of the experiment directory and reused
    when that file already exists.

    @param course: course to process (its getName() is used in the cache file name)
    @param features: candidate features
    @param whereToCut: period/session forwarded to predictionTaskNTimes()
    @param dictBestFeature: when not None, forwarded to populateDictBestFeaturesPerPeriod()

    @return: (features, score) where feature is the list of the 'best' features and score the score obtained for this
    set of features (None when no feature was selected)
    '''
    nameCourse = course.getName()
    ntime = 10

    fileNameSavedResult = os.path.join(self.directoryExperiment,
                                       'SavedForwardOrderedFeaturesCourse'+str(nameCourse)+
                                       'Session'+str(whereToCut)+'.p')

    if os.path.exists(fileNameSavedResult):
        # NOTE(review): pickle is only safe on cache files written by this method itself.
        with open(fileNameSavedResult, "rb") as fileResult:
            (featureBase, scores) = pickle.load(fileResult)
    else:
        featureBase = []
        scores = []

        while len(featureBase) < len(features):

            scoreMax = 0.0
            featureMax = None

            for feature in features:
                if feature in featureBase:
                    continue

                featureCurrent = list(featureBase)
                featureCurrent.append(feature)

                print("session/period="+str(whereToCut))
                print("featureBase: "+str(featureBase))
                print("featuresAll: "+str(features))
                print("feature tested: "+str(feature))

                self.classifier.features = featureCurrent

                # predictionTaskNTimes() returns (accuracy, confusionMatrice, dictResultWeight,
                # auc, stdAccuracy, stdAuc); index the AUC so extra values do not break the call.
                result = self.predictionTaskNTimes(course,
                                                   whereToCut,
                                                   ntime=ntime,
                                                   cache=True)
                score = result[3]

                if score >= scoreMax:
                    scoreMax = score
                    featureMax = feature

            # No candidate reached a comparable score (e.g. NaN AUC): nothing can be
            # added any more, so leave instead of looping forever.
            if featureMax is None:
                break

            # stop the process if the score is not improved
            if scores and scores[-1] >= scoreMax:
                break

            featureBase.append(featureMax)
            scores.append(scoreMax)

            print(("featureBase: "+str(featureBase)))
            print(("featuresAll: "+str(features)))
            print(("scores: "+str(scores)))

        with open(fileNameSavedResult, "wb") as fileResult:
            pickle.dump((featureBase, scores), fileResult)

    print(("Ordered Feature:"+str(featureBase)))

    # One feature is appended per round, so the per-round views can be rebuilt from the
    # ordered list; this keeps them filled when the result comes from the cache file.
    featureSelected = list(featureBase)
    featureBest = [list(featureBase[:size]) for size in range(1, len(featureBase) + 1)]

    self.writeResultBestFeatureCourse('Forward',
                                      scores,
                                      featureBest,
                                      featureSelected,
                                      nameCourse,
                                      whereToCut,
                                      ntime)

    if dictBestFeature is not None:
        self.populateDictBestFeaturesPerPeriod(course,
                                               dictBestFeature,
                                               scores,
                                               featureBest,
                                               whereToCut)

    return (featureBase, scores[-1] if scores else None)
# def launchTestFeatures(self, period=None):
#
# featuresAll = [0]#FeatureSequence.FEATURES
......
......@@ -40,6 +40,7 @@ from project.projectParameters import ProjectParameters
from classifierManager.model.svm import Svm
from classifierManager.script.scriptClassifier import ScriptClassifier
from classifierManager.constant import Constant
class ScriptSvm(ScriptClassifier):
'''
......@@ -71,10 +72,10 @@ class ScriptSvm(ScriptClassifier):
def getForwardBestFeaturesCourse(self,
course,
groups,
features,
whereToCut=None,
dictBestFeature=None):
dictBestFeature=None,
cache = False):
'''
forward method for the feature selection
......@@ -95,7 +96,7 @@ class ScriptSvm(ScriptClassifier):
'SavedForwardOrderedFeaturesCourse'+str(nameCourse)+
'Session'+str(whereToCut)+'.p')
if os.path.exists(fileNameSavedResult):
if os.path.exists(fileNameSavedResult) and cache:
with open(fileNameSavedResult, "rb") as fileResult:
(featureBase, scores) = pickle.load(fileResult)
......@@ -118,10 +119,9 @@ class ScriptSvm(ScriptClassifier):
self.classifier.features = featureCurrent
(accuracy, _, _, auc) = self.predictionTaskNTimes(course,
groups,
(_, _, _, auc) = self.predictionTaskNTimes(course,
whereToCut,
ntime=10,
ntime=ntime,
cache=True)
score = auc
......@@ -161,10 +161,15 @@ class ScriptSvm(ScriptClassifier):
if dictBestFeature is not None:
self.populateDictBestFeaturesPerPeriod(course,
dictBestFeature,
scores,
featureBest,
whereToCut)
dictBestFeature,
scores,
featureBest,
whereToCut)
indexMax = scores.index(max(scores))
featureMax = featureBest[indexMax]
return featureMax
def getBackwardBestFeaturesCourse(self,
course,
......@@ -267,10 +272,12 @@ class ScriptSvm(ScriptClassifier):
if dictBestFeature is not None:
self.populateDictBestFeaturesPerPeriod(course,
dictBestFeature,
scores,
featureBest,
whereToCut)
dictBestFeature,
scores,
featureBest,
whereToCut)
return featureBest
def getMergedBackwardBestFeaturesCourse(self,
......@@ -508,7 +515,6 @@ class ScriptSvm(ScriptClassifier):
scoreMax = scores[indexMax]
fileNameResult = os.path.join(self.directoryExperiment,
'merged'+
backOrForward+
'OrderedFeaturesCourse'+
str(nameCourse)+
......@@ -528,9 +534,9 @@ class ScriptSvm(ScriptClassifier):
fileResult.write("Score: "+str(scoreMax) + "\n")
def getBestFeaturesAllCourses(self,
method,
features=None,
whereToCut=None,
method='backward',
whereToCut=None,
dictBestFeature=None):
'''
......@@ -544,12 +550,22 @@ class ScriptSvm(ScriptClassifier):
dictCourses = {}
for idCourse in self.classifier.courses:
(featuresBest, _) = self.getBackwardBestFeaturesCourse(idCourse,
features,
whereToCut=whereToCut,
dictBestFeature=dictBestFeature)
dictCourses[idCourse] = featuresBest
for course in self.classifier.courses:
if(method == Constant.BACKWARD):
featuresMax = self.getBackwardBestFeaturesCourse(course,
features,
whereToCut=whereToCut,
dictBestFeature=dictBestFeature)
elif(method == Constant.FORWARD):
featuresMax = self.getForwardBestFeaturesCourse(course,
features,
whereToCut=whereToCut,
dictBestFeature=dictBestFeature)
dictCourses[course.getCourseId()] = featuresMax
finalBestFeature = {}
......@@ -590,13 +606,15 @@ class ScriptSvm(ScriptClassifier):
return dictBestFeature
def getBestFeaturesAllPeriods(self):
def getBestFeaturesAllPeriods(self, method):
#sessions = [25, 50, 75, None]
sessions = [None, 75, 50, 25]
features = self.classifier.features.copy()
for t in sessions:
self.getBestFeaturesAllCourses(features, whereToCut=t)
self.getBestFeaturesAllCourses(features,
method=method,
whereToCut=t)
self.classifier.features = features.copy()
......@@ -24,17 +24,34 @@ import abc
from exportManager.export import Export
class ExportBestFeature(Export):
'''
Class to generate the code with the best features of a classification algorithm.
'''
def __init__(self,
pathDirectory,
fileName,
dictBestFeature
dictBestFeature,
dictNameAttribute
):
'''
@param pathDirectory: directory path to save the file (string)
@param fileName: name of the file to save the code (string)
@param dictBestFeature: dictionary with the id course as id, values
are a dictionary with id (25, 50, 75, 100)
and values a list of features
@param dictNameAttribute: dictionary with the name of the attribute
- id are idCourses and values the attribute name (from model.constant)
'''
super().__init__(pathDirectory, fileName)
self.dictBestFeature = dictBestFeature
self.dictNameAttribute = dictNameAttribute
def export(self):
......@@ -45,7 +62,10 @@ class ExportBestFeature(Export):
for idCourse in self.dictBestFeature.keys():
strIdCourse = idCourse.replace('-', '_')
if(self.dictNameAttribute is None):
strIdCourse = idCourse.replace('-', '_')
else:
strIdCourse = self.dictNameAttribute[idCourse]
dictOneCourse = self.dictBestFeature[idCourse]
......@@ -66,11 +86,22 @@ class ExportBestFeature(Export):
s3= "BEST_LR_FEATURE_ALL_COURSES = { "
for idCourse in list(self.dictBestFeature.keys())[0:-1]:
strIdCourse = idCourse.replace('-', '_')
if(self.dictNameAttribute is None):
strIdCourse = idCourse.replace('-', '_')
else:
strIdCourse = self.dictNameAttribute[idCourse]
s3 += "ConstantModel."+strIdCourse+": "+BEST_LR_FEATURE+strIdCourse+ALL_PERIODS+","
idCourse = list(self.dictBestFeature.keys())[-1]
strIdCourse = idCourse.replace('-', '_')
if(self.dictNameAttribute is None):
strIdCourse = idCourse.replace('-', '_')
else:
strIdCourse = self.dictNameAttribute[idCourse]
s3 += "ConstantModel."+strIdCourse+": "+BEST_LR_FEATURE+strIdCourse+ALL_PERIODS+"}"
fileResult.write(s3)
......
......@@ -61,25 +61,35 @@ class Constant(ConstantInterface):
IMPLEMENTED_FEATURES_BEST_LR_25_EDM = [4, 7, 8, 10, 11]
IMPLEMENTED_FEATURES_BEST_LR_100 = [7, 6, 12, 28, 8, 10, 14, 11, 27, 30, 9, 5]
IMPLEMENTED_FEATURES_BACKWARD_BEST_LR_100 = [7, 6, 12, 28, 8, 10, 14, 11, 27, 30, 9, 5]
IMPLEMENTED_FEATURES_BACKWARD_BEST_LR_75 = [7, 14, 30, 32, 9, 27]
IMPLEMENTED_FEATURES_BACKWARD_BEST_LR_50 = [27, 7, 28]
IMPLEMENTED_FEATURES_BACKWARD_BEST_LR_25 = [7, 14, 19, 22, 30, 35, 9, 13, 23, 36, 4, 12, 15, 18, 20, 21, 28]
IMPLEMENTED_FEATURES_BEST_LR_75 = [7, 14, 30, 32, 9, 27]
IMPLEMENTED_FEATURES_BACKWARD_BEST_LR = { 25 : IMPLEMENTED_FEATURES_BACKWARD_BEST_LR_25,
50 : IMPLEMENTED_FEATURES_BACKWARD_BEST_LR_50,
75 : IMPLEMENTED_FEATURES_BACKWARD_BEST_LR_75,
100 : IMPLEMENTED_FEATURES_BACKWARD_BEST_LR_100}
IMPLEMENTED_FEATURES_BEST_LR_50 = [27, 7, 28]
IMPLEMENTED_FEATURES_BEST_BACKWARD_LR_MERGED_ALL_COURSES_25 = [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36]
IMPLEMENTED_FEATURES_BEST_LR_25 = [7, 14, 19, 22, 30, 35, 9, 13, 23, 36, 4, 12, 15, 18, 20, 21, 28]
IMPLEMENTED_FEATURES_BEST_FORWARD_LR_MERGED_ALL_COURSES_25 = [30, 35, 34, 11, 33]
IMPLEMENTED_FEATURES_BEST_FORWARD_LR_MERGED_ALL_COURSES_50 = [27, 32, 9, 22]
IMPLEMENTED_FEATURES_BEST_FORWARD_LR_MERGED_ALL_COURSES_75 = [27, 28, 32, 9, 10, 35, 5, 15]
IMPLEMENTED_FEATURES_BEST_FORWARD_LR_MERGED_ALL_COURSES_100 = [27, 1, 7, 23]
IMPLEMENTED_FEATURES_FORWARD_BEST_MERGED_LR = { 25 : IMPLEMENTED_FEATURES_BEST_FORWARD_LR_MERGED_ALL_COURSES_25,
50 : IMPLEMENTED_FEATURES_BEST_FORWARD_LR_MERGED_ALL_COURSES_50,
75 : IMPLEMENTED_FEATURES_BEST_FORWARD_LR_MERGED_ALL_COURSES_75,
100 : IMPLEMENTED_FEATURES_BEST_FORWARD_LR_MERGED_ALL_COURSES_100}
IMPLEMENTED_FEATURES_BEST_LR = { 25 : IMPLEMENTED_FEATURES_BEST_LR_25,
50 : IMPLEMENTED_FEATURES_BEST_LR_50,
75 : IMPLEMENTED_FEATURES_BEST_LR_75,
100 : IMPLEMENTED_FEATURES_BEST_LR_100}
IMPLEMENTED_FEATURES_BEST_LR_MERGED_ALL_COURSES_25 = [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36]
'''
Best features for each course independently
'''
'''
BEST_LR_FEATURE_ARDUINO_25 = [0, 1, 7, 9, 12, 14, 15, 16, 17, 19, 20, 22, 26, 30, 32, 35, 36]
BEST_LR_FEATURE_ARDUINO_50 = [27]
BEST_LR_FEATURE_ARDUINO_75 = [9, 10, 14, 27, 28, 30]
......@@ -213,6 +223,94 @@ class Constant(ConstantInterface):
ConstantModel.COURSE_ID_TWITTER: BEST_LR_FEATURE_TWITTER_ALL_PERIODS,
ConstantModel.COURSE_ID_WEB: BEST_LR_FEATURE_WEB_ALL_PERIODS,
ConstantModel.COURSE_ID_XML: BEST_LR_FEATURE_XML_ALL_PERIODS}
'''
BEST_LR_FEATURE_COURSE_ID_JAVA_25 = [27, 37, 3]
BEST_LR_FEATURE_COURSE_ID_JAVA_50 = [27, 22, 34, 8, 18, 3]
BEST_LR_FEATURE_COURSE_ID_JAVA_75 = [27, 13, 19]
BEST_LR_FEATURE_COURSE_ID_JAVA_100 = [7]
BEST_LR_FEATURE_COURSE_ID_JAVA_ALL_PERIODS = { 25:BEST_LR_FEATURE_COURSE_ID_JAVA_25, 50:BEST_LR_FEATURE_COURSE_ID_JAVA_50, 75:BEST_LR_FEATURE_COURSE_ID_JAVA_75, 100:BEST_LR_FEATURE_COURSE_ID_JAVA_100}
BEST_LR_FEATURE_COURSE_ID_XML_25 = [27, 22, 30]
BEST_LR_FEATURE_COURSE_ID_XML_50 = [27, 19]
BEST_LR_FEATURE_COURSE_ID_XML_75 = [7]
BEST_LR_FEATURE_COURSE_ID_XML_100 = [7]
BEST_LR_FEATURE_COURSE_ID_XML_ALL_PERIODS = { 25:BEST_LR_FEATURE_COURSE_ID_XML_25, 50:BEST_LR_FEATURE_COURSE_ID_XML_50, 75:BEST_LR_FEATURE_COURSE_ID_XML_75, 100:BEST_LR_FEATURE_COURSE_ID_XML_100}
BEST_LR_FEATURE_COURSE_ID_IONIC_25 = [27, 1]
BEST_LR_FEATURE_COURSE_ID_IONIC_50 = [27, 15]
BEST_LR_FEATURE_COURSE_ID_IONIC_75 = [27]
BEST_LR_FEATURE_COURSE_ID_IONIC_100 = [27]
BEST_LR_FEATURE_COURSE_ID_IONIC_ALL_PERIODS = { 25:BEST_LR_FEATURE_COURSE_ID_IONIC_25, 50:BEST_LR_FEATURE_COURSE_ID_IONIC_50, 75:BEST_LR_FEATURE_COURSE_ID_IONIC_75, 100:BEST_LR_FEATURE_COURSE_ID_IONIC_100}
BEST_LR_FEATURE_COURSE_ID_RUBY_25 = [4]
BEST_LR_FEATURE_COURSE_ID_RUBY_50 = [21]
BEST_LR_FEATURE_COURSE_ID_RUBY_75 = [7, 0, 27, 19]
BEST_LR_FEATURE_COURSE_ID_RUBY_100 = [7]
BEST_LR_FEATURE_COURSE_ID_RUBY_ALL_PERIODS = { 25:BEST_LR_FEATURE_COURSE_ID_RUBY_25, 50:BEST_LR_FEATURE_COURSE_ID_RUBY_50, 75:BEST_LR_FEATURE_COURSE_ID_RUBY_75, 100:BEST_LR_FEATURE_COURSE_ID_RUBY_100}
BEST_LR_FEATURE_COURSE_ID_NODE_JS_25 = [27, 18, 21]
BEST_LR_FEATURE_COURSE_ID_NODE_JS_50 = [27, 11, 7]
BEST_LR_FEATURE_COURSE_ID_NODE_JS_75 = [27, 7, 35, 21]
BEST_LR_FEATURE_COURSE_ID_NODE_JS_100 = [11]
BEST_LR_FEATURE_COURSE_ID_NODE_JS_ALL_PERIODS = { 25:BEST_LR_FEATURE_COURSE_ID_NODE_JS_25, 50:BEST_LR_FEATURE_COURSE_ID_NODE_JS_50, 75:BEST_LR_FEATURE_COURSE_ID_NODE_JS_75, 100:BEST_LR_FEATURE_COURSE_ID_NODE_JS_100}
BEST_LR_FEATURE_COURSE_ID_ARDUINO_25 = [27, 1