Commit 8d8ae11c authored by Florian RICHOUX

From 20 to 24 AA

parent 747bea50
import sys
def usage():
    '''
    Print the expected command line and terminate the program with exit status 1.
    '''
    program = sys.argv[0]
    print("Usage: {} FILE".format(program))
    # sys.exit(1) is exactly this: it raises SystemExit with the given status.
    raise SystemExit(1)
def spot( word, count ):
    '''
    Report which of the special letters B, U, X and Z occur in a sequence.
    For every such letter found in word, one line is printed with the letter,
    the line number, the index of the letter's first occurrence, and the word.
    Arguments:
        word: sequence string to inspect.
        count: line number of the sequence, only used in the printed report.
    '''
    # A tuple rather than a set: iteration order of a set of strings depends
    # on hash randomization, so the report order used to change between runs.
    for letter in ('B', 'U', 'X', 'Z'):
        position = word.find( letter )
        if position != -1:
            print("{} at ({},{}) in {}".format( letter, count, position, word ) )
def count(data):
    '''
    Collect the set of letters used by the sequences of a data file.
    The first line of the file is discarded (presumably a header -- confirm
    against the dataset). Every following line is split on single spaces;
    the third and fourth fields are treated as sequences: their letters are
    added to the result and each one is passed to spot() together with its
    1-based line number in the file.
    Arguments:
        data: path of the file to read.
    Returns:
        The set of all characters found in the third and fourth fields.
    Raises:
        IndexError: if a non-blank line has fewer than four fields.
    '''
    aa = set()
    # 'with' guarantees the file is closed even if a line is malformed.
    with open(data, 'r') as stream:
        for line_count, line in enumerate( stream, start=1 ):
            if line_count == 1:
                # first line is skipped, it carries no sequence
                continue
            if not line.strip():
                # tolerate blank lines (typically a trailing one at end of file)
                continue
            # Strip the newline before splitting so that it can never end up
            # in the letter set when a sequence is the last field of the line.
            words = line.rstrip('\n').split(" ")
            aa.update( words[2], words[3] )
            spot( words[2], line_count )
            spot( words[3], line_count )
    return aa
if __name__ == '__main__':
    # Exactly one command-line argument is expected: the data file to scan.
    if len(sys.argv) != 2:
        usage()

    aa = count( sys.argv[1] )
    # Sorted letters followed by how many distinct letters were found.
    summary = "Set of AA: {} ({})".format( sorted(aa), len(aa) )
    print(summary)
    sys.exit(0)
......@@ -4,7 +4,7 @@ from keras import utils as ut
def letter2number( letter ):
'''
The dataset has 20 amino acids, encoded with letters (every letter but B, J, O, U, X and W).
The dataset has 24 amino acids, encoded with letters (every letter but J and O).
No switch case in python, hence the ugly if elifs
'''
number = 0
......@@ -48,6 +48,15 @@ def letter2number( letter ):
number = 19
elif letter == 'Y':
number = 20
# Special letters here
elif letter == 'B':
number = 21
elif letter == 'U':
number = 22
elif letter == 'X':
number = 23
elif letter == 'Z':
number = 24
return number
......@@ -83,7 +92,7 @@ def one_hot(y, max_size, num_classes=None):
E.g. for use with categorical_crossentropy.
Arguments:
y: class vector to be converted into a matrix
(integers from 0 to num_classes).
(integers from 1 to num_classes).
num_classes: total number of classes.
Returns:
A binary matrix representation of the input.
......@@ -117,8 +126,8 @@ def load_data( file_name ):
for line in data.readlines():
one_line = line.rstrip('\n').split(' ')
# one-hotting proteins
protein1.append( one_hot( sequence2array( one_line[2] ), max_size, num_classes=20 ) )
protein2.append( one_hot( sequence2array( one_line[3] ), max_size, num_classes=20 ) )
protein1.append( one_hot( sequence2array( one_line[2] ), max_size, num_classes=24 ) )
protein2.append( one_hot( sequence2array( one_line[3] ), max_size, num_classes=24 ) )
output.append( int( one_line[4] ) )
protein1 = np.asarray( protein1 )
......
......@@ -11,14 +11,14 @@ class FC2_20_2Dense(AbstractModel):
super().__init__()
def get_model(self):
input1 = Input(shape=(1166,20,), dtype=np.float32, name='protein1')
input1 = Input(shape=(1166,24,), dtype=np.float32, name='protein1')
protein1 = layers.Flatten()(input1)
protein1 = layers.Dense(20, activation='relu')(protein1)
protein1 = layers.BatchNormalization()(protein1)
protein1 = layers.Dense(20, activation='relu')(protein1)
protein1 = layers.BatchNormalization()(protein1)
input2 = Input(shape=(1166,20,), dtype=np.float32, name='protein2')
input2 = Input(shape=(1166,24,), dtype=np.float32, name='protein2')
protein2 = layers.Flatten()(input2)
protein2 = layers.Dense(20, activation='relu')(protein2)
protein2 = layers.BatchNormalization()(protein2)
......
......@@ -22,7 +22,7 @@ class LSTM32_3Conv3_2Dense_S(AbstractModel):
batchnorm3 = layers.BatchNormalization()
lstm = layers.LSTM(32)
input1 = Input(shape=(None,20,), dtype=np.float32, name='protein1')
input1 = Input(shape=(None,24,), dtype=np.float32, name='protein1')
protein1 = conv1(input1)
protein1 = pool1(protein1)
protein1 = batchnorm1(protein1)
......@@ -34,7 +34,7 @@ class LSTM32_3Conv3_2Dense_S(AbstractModel):
protein1 = batchnorm3(protein1)
protein1 = lstm(protein1)
input2 = Input(shape=(None,20,), dtype=np.float32, name='protein2')
input2 = Input(shape=(None,24,), dtype=np.float32, name='protein2')
protein2 = conv1(input2)
protein2 = pool1(protein2)
protein2 = batchnorm1(protein2)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment