Commit ba104221 authored by Florian RICHOUX's avatar Florian RICHOUX

Merge branch 'release/tcbb'

parents 8f13a246 4121879c
This project aims at discovering unmet descriptions of Protein-Protein Interactions using a large-scale approach.
\ No newline at end of file
# What is DeepPPI?
DeepPPI is a deep learning project to predict protein-protein interactions based on protein sequence only.
It is declined into PyTorch models (deprecated) and Keras models.
# How to use our Keras models?
In the 'keras' folder, run the main.py with Python. For instance, to print helps, type
`python main.py -h`
To train a model, you must give both a training set and a validation set. For instance
`python main.py -train ../data/small_1166_train.txt -val ../data/small_1166_val.txt`
In our IEEE/ACM Transactions on Computational Biology and Bioinformatics / arXiv paper, our datasets can be found in:
..* text files in `data/mirror` for our regular dataset
..* text files in `data/mirror/double` for our strict dataset
The two models from these papers can be found at `keras/models/fc2_20_2dense.py` and `keras/models/lstm32_3conv3_2dense_shared.py`. Trained weights of these models are in `keras/weights/`.
To load weights and test a model on a test set, the command is the following:
`python main.py -load <weights_file> -test <test_file>`
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
This source diff could not be displayed because it is too large. You can view the blob instead.
This diff is collapsed.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os, sys
#sys.path.append('/comptes/teletchea-s/venv_specifiques/PPIpredict/lib/python3.4/site-packages/')
#sys.path.append('/home/flo/share/Developpement/PPIpredict/lib/python3.4/')
# Utilisation de Bio.PDB
# http://biopython.org/wiki/The_Biopython_Structural_Bioinformatics_FAQ
from Bio.PDB import *
### Specify path for aaindex init othwerwise python cmd is called...
import aaindex as aaindex
aaindex.init(os.getcwd(),index='13')
# TODO FR : normaliser les données pour les coordonnées 3D
# TODO FR : piste 1 -> normaliser en fonction de la taille de la maille cristallo : difficile car on doit tenir compte du nmbre de structures dans la maille
# TODO FR : piste 2 -> normalisation magique avec numpy -> http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.norm.html
# TODO ST : retourner un vecteur par acide aminé (entier de 0 à 19)
# TODO ST : retourner un vecteur par type d'atome (0 à 4 (ou 5 ?))
# TODO ST : vecteur interaction séparé du vecteur data
# TODO ST : meilleur méthode de séparation des acides aminés à l'interface
# voir l'option NeighborSearch ou http://www.pymolwiki.org/index.php/InterfaceResidues
# TODO ST : mieux séparer les chaines pour être sûr d'avoir deux protéines différentes, en interaction 1:1
one_letter ={'VAL':'V', 'ILE':'I', 'LEU':'L', 'GLU':'E', 'GLN':'Q',
'ASP':'D', 'ASN':'N', 'HIS':'H', 'TRP':'W', 'PHE':'F', 'TYR':'Y',
'ARG':'R', 'LYS':'K', 'SER':'S', 'THR':'T', 'MET':'M', 'ALA':'A',
'GLY':'G', 'PRO':'P', 'CYS':'C'}
def get_residues_in_contact(model, aa):
ca1=aa['CA']
distance=11
for c in model:
if c != aa.get_parent():
for residue in c:
ca2=residue['CA']
ca2name=residue.get_resname()
distance = ca2-ca1
if distance < 10:
# print ("La distance entre {} et {} est : {}".format(c, ca2name, distance))
return ca2name, 1
else:
return ca2name, 0
def get_amino_acid_parameters(AA):
aa=aaindex.to_one_letter_code[AA]
# TODO : sélection arbitraire des sources
# bien entendu, à raffiner ...
# http://www.genome.jp/aaindex/AAindex/list_of_indices
# http://www.genome.jp/aaindex/AAindex/Appendix
parameters={}
parameters['JOND750101']='Hydrophobicity (Jones, 1975)'
parameters['KRIW790103']='Side chain volume (Krigbaum-Komoriya, 1979)'
parameters['CHOP780201']='Normalized frequency of alpha-helix (Chou-Fasman, 1978b)'
parameters['CHOP780202']='Normalized frequency of beta-sheet (Chou-Fasman, 1978b)'
parameters['DAYM780101']='Amino acid composition (Dayhoff et al., 1978b)'
parameters['CHAM820101']='Polarizability parameter (Charton-Charton, 1982)'
v=[]
for p in parameters.keys():
param=aaindex.get(p)
try:
v.append(param.get(aa))
except Exception as e:
v.append(0.0)
# print ("Le vecteur trouvé est {}".format(l))
return v
def output_old_version(structure):
vect_coords=[]
for model in structure:
for chain in model:
for residue in chain:
r = residue.get_resname()
interaction=0
s=''
# if r != 'HOH' and r != ' MG':
if r in one_letter.keys():
try:
s,interaction=get_residues_in_contact(model, residue)
# Phase 2 : uniquement AA1 AA2 contact ou pas
# vect_tout=[r,s,interaction]
# output_vecteurs(vect_tout)
except Exception as e:
# print ("L'erreur est %s" % e)
pass
vect_aa=get_amino_acid_parameters(r)
for atom in residue:
print r , atom.get_name(), atom.get_coord()
coords=atom.get_coord()
# Ne garder que le type d'atome ou son nom ?
# print ("%d %s %s %5.2f %5.2f %5.2f" % (interaction, r, atom.get_name()[0], coords[0],coords[1], coords[2]))
# Phase 3
# vect_coords=[interaction, r, atom.get_name()[0], coords[0],coords[1], coords[2]]
# vect_tout=vect_coords+vect_aa
# output_vecteurs(vect_tout)
def output_vecteurs(mon_vecteur):
print (mon_vecteur)
def main():
# p = MMCIFParser()
# structure = p.get_structure('demo', '1a2y.cif')
p=PDBParser()
structure = p.get_structure('demo', '1wq1.pdb')
# Structures à considérer pour le jeu de données initial
# - 1tv6
# - 1e27
# - 1pys
# - 1tnu (mais A,B)
# - 2cc1 (mais A,B)
# - 1m2t
# - 1wq1 (ras-GAP)
# print ("Le contenu du dictionnaire est %s" % structure.header)
# for h in structure.header:
# print ("L'entrée est %s" % h)
chain_list=[]
for c in structure.get_chains():
chain_list.append(c)
print("Il y a {} chaines dans le pdb, la première chaine est {}".format(len(chain_list),chain_list[0].get_id()))
# TODO : faire un vecteur qui donne non seulement l'interaction, mais aussi les deux atomes en interaction
# TODO : indiquer l'hybridation chimique de l'atome
# TODO : typer les atomes avec un champ de force à définir
output_old_version(structure)
if __name__ == "__main__":
main()
#!/bin/bash
pdbdir="/comptes/hoffmann-l/pdb_database"
for file in $pdbdir/*/*.gz;
do
nbuniprot=$( zgrep DBREF $file |awk '{print $7'}|sort|uniq|wc -l )
if [ $nbuniprot -eq 2 ]; then
pdb=$( basename $file )
pdb=${pdb/.ent.gz/}
pdb=${pdb/pdb/}
echo $pdb
# echo "$pdb : $nbuniprot entries"
fi
done
#! /usr/bin/python
# Copyright (c) 2003 Robert L. Campbell
from cctbx import uctbx, sgtbx
from pymol.cgo import *
from pymol import cmd
from pymol.vfont import plain
def set_to_zero(a):
if abs(a) < 1e-10:
a=0
return a
def draw_cell(obj,radius=0.2,name="cell"):
"""
From pymol issue the "run draw_cell.py" command to load the script,
then issue the "draw_cell(object,<optional radius>)" command
to actually run it and create the cgo object showing the unit cell
border for the space group specified by molecular object 'object'.
e.g. load 1avv.pdb
run draw_cell.py
draw_cell 1avv 0.5 (or draw_cell('1avv',.5))
see also help(draw_cell_param) to draw the cell border for
user-defined cell dimensions (i.e. not loaded from a pdb file)
See also "help(draw_cell_param) to draw the cell border by
specifying the unit cell parameters directly (i.e. not loaded from
a pdb file).
"""
radius=float(radius)
cell_info=cmd.get_symmetry(obj)
draw_cell_param(cell_info[0:6],radius,name)
def draw_cell_param(cell_param_list,radius=0.2,name="cell"):
"""
If you wish to draw the unit cell border for any cell without the
need to load a pdb file, then do this:
e.g. run draw_cell.py
draw_cell_param((45.2,45.2,70.8,90.,90.,120.),0.5)
to generate the cell border for this trigonal space group "p 31 2 1"
with a radius of 0.5A. Labels for the origin, and A, B and C axes
will appear as well. The perimeter of the cell is colored with the
RGB components corresponding to the A,B,C components.
"""
size = radius*15.
origin_offset = radius * -25.
U=uctbx.unit_cell((cell_param_list))
vert_000 = map(set_to_zero,U.orthogonalize((0.,0.,0)))
vert_100 = map(set_to_zero,U.orthogonalize((1.,0.,0)))
vert_010 = map(set_to_zero,U.orthogonalize((0.,1.,0)))
vert_001 = map(set_to_zero,U.orthogonalize((0.,0.,1)))
vert_110 = map(set_to_zero,U.orthogonalize((1.,1.,0)))
vert_011 = map(set_to_zero,U.orthogonalize((0.,1.,1)))
vert_101 = map(set_to_zero,U.orthogonalize((1.,0.,1)))
vert_111 = map(set_to_zero,U.orthogonalize((1.,1.,1)))
# vert_000 = map(None,U.orthogonalize((0.,0.,0)))
# vert_100 = map(None,U.orthogonalize((1.,0.,0)))
# vert_010 = map(None,U.orthogonalize((0.,1.,0)))
# vert_001 = map(None,U.orthogonalize((0.,0.,1)))
# vert_110 = map(None,U.orthogonalize((1.,1.,0)))
# vert_011 = map(None,U.orthogonalize((0.,1.,1)))
# vert_101 = map(None,U.orthogonalize((1.,0.,1)))
# vert_111 = map(None,U.orthogonalize((1.,1.,1)))
#print vert_000
#CYLINDER = ['CYLINDER']
#radius = [0.2]
#print radius
cell = []
cell.append(CYLINDER)
cell = cell + vert_000 + vert_100 + [radius] + [0,0,0] + [1,0,0]
cell.append(CYLINDER)
cell = cell + vert_000 + vert_010 + [radius] + [0,0,0] + [0,1,0]
cell.append(CYLINDER)
cell = cell + vert_000 + vert_001 + [radius] + [0,0,0] + [0,0,1]
cell.append(CYLINDER)
cell = cell + vert_100 + vert_110 + [radius] + [1,0,0] + [1,1,0]
cell.append(CYLINDER)
cell = cell + vert_100 + vert_101 + [radius] + [1,0,0] + [1,0,1]
cell.append(CYLINDER)
cell = cell + vert_010 + vert_110 + [radius] + [0,1,0] + [1,1,0]
cell.append(CYLINDER)
cell = cell + vert_010 + vert_011 + [radius] + [0,1,0] + [0,1,1]
cell.append(CYLINDER)
cell = cell + vert_001 + vert_101 + [radius] + [0,0,1] + [1,0,1]
cell.append(CYLINDER)
cell = cell + vert_001 + vert_011 + [radius] + [0,0,1] + [0,1,1]
cell.append(CYLINDER)
cell = cell + vert_110 + vert_111 + [radius] + [1,1,0] + [1,1,1]
cell.append(CYLINDER)
cell = cell + vert_101 + vert_111 + [radius] + [1,0,1] + [1,1,1]
cell.append(CYLINDER)
cell = cell + vert_011 + vert_111 + [radius] + [0,1,1] + [1,1,1]
cmd.load_cgo(cell,name)
#return cell
text = [COLOR, 1.0, 0.0, 1.0,]
cyl_text(text,plain,[origin_offset,origin_offset,-1],'Origin',radius,axes=[[size,0.0,0.0],[0.0,size,0.0],[0.0,0.0,size]],color=[1.0,0.0,1.0])
cyl_text(text,plain,map(None,U.orthogonalize((1.05,0.0,0.0))),'A',radius,axes=[[size,0.0,0.0],[0.0,size,0.0],[0.0,0.0,size]],color=[1.0,0.0,0.0])
cyl_text(text,plain,map(None,U.orthogonalize((0.0,1.05,0.0))),'B',radius,axes=[[size,0.0,0.0],[0.0,size,0.0],[0.0,0.0,size]],color=[0.0,1.0,0.0])
cyl_text(text,plain,map(None,U.orthogonalize((0.0,0.0,1.05))),'C',radius,axes=[[size,0.0,0.0],[0.0,size,0.0],[0.0,0.0,size]],color=[0.0,0.0,1.0])
cmd.load_cgo(text,name+'_labels')
cmd.extend("draw_cell",draw_cell)
This diff is collapsed.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
import os
import random
import requests
import time
import sys
from xml.etree import ElementTree as ET
from Bio import SeqIO
# Utilisation de Bio.PDB
# http://biopython.org/wiki/The_Biopython_Structural_Bioinformatics_FAQ
#from Bio.PDB import *
def download_uniprot_file(uniprot_id):
unp_file='../uniprot/'+uniprot_id+'.xml'
if not os.path.exists(unp_file) or os.path.getsize(unp_file) == 15614:
print("Downloading ... {}".format(uniprot_id), file=sys.stderr)
unp_url = 'https://www.uniprot.org/uniprot/'+unp_file
unp = requests.get(unp_url)
with open(unp_file,'w') as f:
f.write(unp.content)
time.sleep(1)
if os.path.getsize(unp_file) == 15614:
print("Uniprot me blackliste ...", file=sys.stderr)
quit()
def get_uniprot(uniprot_id):
download_uniprot_file(uniprot_id)
o=get_uniprot_object(uniprot_id)
return o
def get_uniprot_object(uniprot_id):
records=SeqIO.parse('../uniprot/'+uniprot_id+'.xml', 'uniprot-xml')
r=list(records)[0]
return r
def get_partners(uniprot_id):
'''
Biopython does not record the binding partners from Intact keeped by Uniprot,
so we need to parse the file using ElementTree
'''
unp_file='../uniprot/'+uniprot_id+'.xml'
u=ET.parse(unp_file)
ur=u.getroot()
partners=[]
for n in ur[0]:
for m in n.iter('{http://uniprot.org/uniprot}interactant'):
for i in m.iter('{http://uniprot.org/uniprot}id'):
partners.append(i.text)
return partners
def main():
print("TODO : réfléchir au cas des homodimères ...", file=sys.stderr)
targets=['Q9Y6Q6','P10415','O14788','P04637','P06213','P04637','P63244','P10275','P08246',
'P05067','P00533','Q14524','P35555','P35222','P00451','P38398','O95273','P68871','P31749',
'Q96EB6','Q05086','Q08379','P01137','P02649','Q5S007','Q12802','P06213','P12931','Q9NYL2',
'P35498','P49768','P62258','Q04917','P27348','P30443','P04439','P01889','P30461','P30462',
'P30464','P18463','Q95365','Q04826','P30479','P30483','P30486','P30487','P30488','P18464',
'P30490','P30491','P30492','P30493','P14060','Q9H2F3','Q9Y3L3','P41595','P28335','Q96P26',
'Q9H0P0','P21589','P52209','O95336','P20848','Q4UJ75','Q5T5F5','P30542','P29275','P0DMS8',
'Q05901','Q9Y614','P11229','Q6ZNF0','Q96QF7','Q08AH3','Q9NUB1','Q4G176','Q9ULC5','Q08AH1',
'Q53FZ2','Q562R1','Q9BYX7','Q9H568','P12814','P35609','Q04771','Q03154','Q96HD9','O14672',
'O75078','P78536','Q9H013','P25100','O43506','Q9UKJ8','Q9UKF5','P08913','Q7Z6V5','Q96EY9',
'P00813','Q7Z695','Q3MIX3','Q5VUY0','O95622','P28332']
# i=0
for u in targets:
u_obj=get_uniprot(u)
partners=get_partners(u)
for p in partners:
p_obj=get_uniprot(p)
print("{} {} {} {} {}".format(u, p, u_obj.seq, p_obj.seq, 1))
# Create dummy dataset for negative interactions
# Guess up to 10 interactions from the db, according to average stats on the genome
putative_interactions=random.randint(0, 20)
putative_partners=[]
for y in xrange(0, putative_interactions):
putative_partners.append(random.randint(0,len(targets)-1))
# print("Le nombre de partenaires à tester est {} et le nombre réduit est {}".format(len(putative_partners), len(set(putative_partners))))
# set est utile car le "tirage au sort" contient souvent des doublons ...
for i in set(putative_partners):
dummy=targets[i]
t_obj=get_uniprot(dummy)
if dummy in partners:
print("{} {} {} {} {}".format(u, dummy, u_obj.seq, t_obj.seq, 1))
else:
print("{} {} {} {} {}".format(u, dummy, u_obj.seq, t_obj.seq, 0))
# except Exception as e:
# print("Erreur: {}".format(e))
# # in this case no identifier is found in the targets list, do nothing
# pass
# i=i+1
if __name__ == "__main__":
main()
import torch
from torch.autograd import Variable
# From github.com/mrdrozdov/pytorch-extras
def one_hot(size, index):
mask = torch.LongTensor(*size).fill_(0)
ones = 1
if isinstance(index, Variable):
ones = Variable(torch.LongTensor(index.size()).fill_(1))
mask = Variable(mask, volatile=index.volatile)
ret = mask.scatter_(1, index, ones)
return ret
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
import random
import sys
from xml.etree import ElementTree as ET
from Bio import SeqIO
class UniprotEntry(object):
uniprotID = ""
sequence = ""
partners = ""
# The class "constructor" - It's actually an initializer
def __init__(self, uniprotID, sequence, partners):
self.uniprotID = uniprotID
self.sequence = sequence
self.partners = partners
def main():
uniprotEntries={}
et = ET.parse('uniprot-human.xml')
#et=ET.parse('mapk_Homo+sapiens107entries.xml')
etr=et.getroot()
for n in etr.iter('{http://uniprot.org/uniprot}entry'):
# Only consider the first uniprot identifier, other ones are duplicate
accession=n.find('{http://uniprot.org/uniprot}accession').text.strip()
sequence=n.find('{http://uniprot.org/uniprot}sequence').text.strip().replace('\r', '').replace('\n', '')
partners=[]
for m in n.iter('{http://uniprot.org/uniprot}interactant'):
for i in m.iter('{http://uniprot.org/uniprot}id'):
partners.append(i.text)
# Get new object for this entry
e=UniprotEntry(accession, sequence, partners)
# e.uniprotID=accession
#accession,sequence,partners)
uniprotEntries[accession]=e
uniprotList=[l for l in uniprotEntries.keys()]
print("La liste est composée de {} membres".format(len(uniprotList)),file=sys.stderr)
for k in uniprotEntries.keys():
for b in uniprotEntries[k].partners:
try:
print("{} {} {} {} 1".format(k, uniprotEntries[b].uniprotID, uniprotEntries[k].sequence, uniprotEntries[b].sequence))
except Exception as e:
print("{} -> {} no info: {}".format(k, b, e), file=sys.stderr)
# Create dummy dataset for negative interactions
# Guess up to 5 interactions from the db, according to average stats on the genome
putative_interactions=random.randint(0, 5)
putative_partners=[]
for y in xrange(0, putative_interactions):
putative_partners.append(random.randint(0,len(uniprotEntries)-1))
# print("Le nombre de partenaires à tester est {} et le nombre réduit est {}".format(len(putative_partners), len(set(putative_partners))))
# set est utile car le "tirage au sort" contient souvent des doublons ...
for i in set(putative_partners):
uid=uniprotList[i]
try:
if not uid in uniprotEntries[k].partners:
print("{} {} {} {} {}".format(uniprotEntries[k].uniprotID, uid, uniprotEntries[k].sequence, uniprotEntries[uid].sequence, 0))
except Exception as e:
print("Pour {}, l'erreur est {}".format(i,e),file=sys.stderr)
pass
if __name__ == "__main__":
main()
import sys
def usage():
print("Usage: {} FILE".format(sys.argv[0]))
sys.exit(1)
def spot( word, count ):
for letter in {'X', 'B', 'Z', 'U'}:
if letter in word:
print("{} at ({},{}) in {}".format( letter, count, word.find( letter ), word ) )
def count(data):
data = open(data, 'r')
skip = data.readline()
aa = set()
line_count = 1
for line in data.readlines():
line_count += 1
words = line.split(" ")
aa = aa | set( words[2] )
aa = aa | set( words[3] )
spot( words[2], line_count )
spot( words[3], line_count )
data.close()
return aa
if __name__ == '__main__':
if len(sys.argv) != 2:
usage()
aa = count( sys.argv[1] )
print("Set of AA: {} ({})".format( sorted(aa), len(aa) ) )
sys.exit(0)
*.aux
*.log
*~
*.toc
*#*
*.out
@book{Goodfellow-et-al-2016,
title={Deep Learning},
author={Ian Goodfellow and Yoshua Bengio and Aaron Courville},
publisher={MIT Press},
note={\url{http://www.deeplearningbook.org}},
year={2016}
}
\ No newline at end of file
This diff is collapsed.
\section{Etape 3}
Informations sur les structures de protéine 3d :
\url{https://peptone.io/dspp}
\ No newline at end of file
\section{Réseaux de neurones}
\subsection{Introduction}
Un réseau de neurones est un système dont les composants, des neurones, qui reçoivent une entrée, et y associent une sortie grâce à une fonction d'activation.
Le réseau est formé car certaines sorties de neurones sont reliées aux entrées d'autres neurones, formant un graphe orienté pondéré par des poids associés aux arêtes du graphe.
Les poids peuvent être modifiés à travers l'apprentissage.
%TODO: add a schéma + better explanation
\subsection{Fonction d'activation}
Une fonction d'activation est une fonction mathématique appliquée au signal en sortie du neurone, qui reprend l'idée du potentiel d'action présent en biologie.
Il en existe de plusieurs types, qui ont des caractéristiques et des utilisations variées.
\subsubsection*{Fonction sigmoïde}
%TODO: add schéma sigmoïde
%TODO: reword
La fonction sigmoïde présente l'intérêt d'avoir une dérivable de sa fonction inverse facile à calculer ce qui intervient dans les performances des algorithmes l'utilisant.
$$
f(x)=\frac{1}{1 + e^{- x}}
$$