Commit 72d29749 authored by Adrien Leger's avatar Adrien Leger
Browse files

Remove blast sources and start to individualize Refmasker sources (broken sources)

parent 3d63ddcd
......@@ -55,3 +55,7 @@ docs/_build/
# PyBuilder
target/
# Custom exceptions
src/RefMasker_conf_file.txt
# RefMasker
Mask homologies in fasta reference sequences based on an user provided ranking (IN DEVELOPMENT)
The order of references is CRITICAL
# if you want to perform a reference masking since it will
# start by the last reference masked by all the others then the penultimate masked by
# all others except the last and and so on until there is only one reference remaining
The order of references is CRITICAL if you want to perform a reference masking by the last reference masked by all the others then the penultimate masked by all others except the last and and so on until there is only one reference remaining
@brief **Wrapper for Blastn from NCBI Blast**
Please see the Blast+ user manual document for further details
[MANUAL](http://www.ncbi.nlm.nih.gov/books/NBK1762/)
Basically the top level function Blastn.align processes sequences as follow:
* If a blastn database is provided it will attempt to validate it by creating and MakeblastdbWrapper.ExistingDB object.
* If the validation fails of if no database was provided a new database will be created by using MakeblastdbWrapper.NewDB from a reference fasta file
* A instance of BlastnWrapper.Aligner is then created by passing the DB object as an argument.
* A list of fasta file (each can contains many sequences) is then submitted one by one to the balst database though BlastnWrapper.Aligner
* For each query fasta a list of BlastHit objects containing informations for blast hits found will be returned
* Hit lists are combined into a single flat list and returned at the end of Blastn.align execution.
@copyright [GNU General Public License v2](http://www.gnu.org/licenses/gpl-2.0.html)
@author Adrien Leger - 2014
* <adrien.leger@gmail.com>
* <adrien.leger@inserm.fr>
* <adrien.leger@univ-nantes.fr>
* [Github](https://github.com/a-slide)
* [Atlantic Gene Therapies - INSERM 1089] (http://www.atlantic-gene-therapies.fr/)
\ No newline at end of file
#~~~~~~~GLOBAL IMPORTS~~~~~~~#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
class BlastHit(object):
"""
@class BlastHit
@brief Object oriented class containing informations of one blast hit
The following instance field are accessible :
* q_id : Query sequence name
* s_id : Subject sequence name
* identity : % of identity in the hit
* length : length of the hit
* mis : Number of mismatch in the hit
* gap : Number of gap in the hit
* q_orient : Orientation of the query along the hit
* q_start : Hit start position of the query
* q_end : Hit end position of the query
* s_orient : Orientation of the subject along the hit
* s_start : Hit start position of the subject
* s_end : Hit end position of the subject
* evalue : E value of the alignement
* bscore : Bit score of the alignement
A class list is used to track all instances generated.
"""
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#~~~~~~~CLASS FIELDS~~~~~~~#
Instances = [] # Class field used for instance tracking
id_count = 0
#~~~~~~~CLASS METHODS~~~~~~~#
@ classmethod
def next_id (self):
cur_id = self.id_count
self.id_count +=1
return cur_id
@ classmethod
def count_total (self):
"""
@return Overall number of BlastHit object in Instance list
"""
return (len(self.Instances))
@ classmethod
def stat_per_ref (self):
"""
@return Number of BlastHit object in Instance list sorted by reference subject sequence
"""
d = {}
for hit in self.Instances:
if hit.s_id in d:
d[hit.s_id][0] += 1
d[hit.s_id][1] += hit.length
else:
d[hit.s_id] = [1, hit.length]
return d
@ classmethod
def get (self):
"""
@return The list of all BlastHit object generated
"""
return self.Instances
@ classmethod
def get_ref (self, ref):
"""
@param ref Name of a reference sequence in the subject database
@return The list of all BlastHit object generated for this reference
"""
return [hit for hit in self.Instances if hit.s_id == "ref"]
@ classmethod
def reset_list (self):
"""
Reset the instance tracking list (Usefull after
"""
self.Instances = []
self.id_count = 0
#~~~~~~~FONDAMENTAL METHODS~~~~~~~#
def __init__(self, q_id, s_id, identity, length, mis, gap, q_start, q_end, s_start, s_end, evalue, bscore):
"""
Create a BlastHit object which is automatically added to the class tracking instance list
The object with the following parameters are required for object initialisation
@param q_id Query sequence name
@param s_id Subject sequence name
@param identity % of identity in the hit
@param length length of the hit
@param mis Number of mismatch in the hit
@param gap Number of gap in the hit
@param q_start Hit start position of the query
@param q_end Hit end position of the query
@param s_start Hit start position of the subject
@param s_end Hit end position of the subject
@param evalue E value of the alignement
@param bscore Bit score of the alignement
"""
self.id = self.next_id()
self.q_id = q_id
self.s_id = s_id
self.identity = float(identity)
self.length = int(length)
self.mis = int(mis)
self.gap = int(gap)
self.evalue = float(evalue)
self.bscore = float(bscore)
# Autoadapt start and end so that start is always smaller than end
self.q_start = int(q_start) if int(q_start) < int(q_end) else int(q_end)
self.q_end = int(q_end) if int(q_start) < int(q_end) else int(q_start)
self.s_start = int(s_start) if int(s_start) < int(s_end) else int(s_end)
self.s_end = int(s_end) if int(s_start) < int(s_end) else int(s_start)
# Orientation of the query and subject along the hit. True if positive
self.q_orient = int(q_start) < int(q_end)
self.s_orient = int(s_start) < int(s_end)
# Add the instance to the class instance tracking list
self.Instances.append(self)
def __repr__(self):
msg = "HIT {}".format(self.id)
msg += "\tQuery\t{}:{}-{}({})\n".format(self.q_id, self.q_start, self.q_end, "+" if self.q_orient else "-")
msg += "\tSubject\t{}:{}-{}({})\n".format(self.s_id, self.s_start, self.s_end, "+" if self.q_orient else "-")
msg += "\tLenght : {}\tIdentity : {}%\tEvalue : {}\tBit score : {}\n".format(self.length, self.identity, self.evalue, self.bscore)
return (msg)
def __str__(self):
return "<Instance of {} from {} >\n".format(self.__class__.__name__, self.__module__)
#~~~~~~~GLOBAL IMPORTS~~~~~~~#
# Standard library packages import
from os import path
# Local library packages import
from pyDNA.Utilities import mkdir, import_seq, file_basename, file_name, file_extension, fgunzip
from BlastnWrapper import Aligner
from MakeblastdbWrapper import NewDB, ExistingDB
#~~~~~~~MAIN METHODS~~~~~~~#
def align (query_list,
subject_db = None,
subject_fasta = None,
aligner = "blastn",
align_opt="",
db_maker = "makeblastdb",
db_opt="",
db_outdir = "./blast_db/",
db_outname = "out"):
"""
Main function of RefMasker that integrate database creation, blast and homology masking
* Instantiate Blast database and blastn object
* Perform iterative blasts of query sequences against the subject database and create a list of
hits.
@param query_list List of paths indicating fasta files containing query sequences (can be
gzipped). Fasta can contains multiple sequences.
@param subject_db Basename of file from a blast database created by "makeblastdb" if available
@param subject_fasta Reference fasta file. Required if no ref_index is given (can be gzipped)
@param aligner Path ot the blastn executable. Not required if blast+ if added to your path
@param blastn_opt Blastn command line options as a string
@param db_maker Path ot the makeblastdb executable. Not required if blast+ if added to your path
@param db_opt makeblastdb command line options as a string
@param db_outdir Directory where to store the database files
@param db_outname Basename of the database files
@return A list of BlastHit objects
"""
# Try to import an existing database
try:
if not subject_db:
raise Exception("No Blast database was provided")
print("Existing database provided")
db = ExistingDB(subject_db)
# If no DB or if an error occured during validation of the existing DB = create a new db
except Exception as E:
print (E)
# Verify the presence of the reference fasta file
if not subject_fasta or not path.isfile (subject_fasta):
raise Exception("Invalid or no fasta file provided. Cannot create a database")
print ("Generate a database...")
mkdir(db_outdir)
db_path = path.join (db_outdir, db_outname)
# Create the new database
db = NewDB(ref_path=subject_fasta, db_path=db_path, makeblastdb_opt=db_opt, makeblastdb=db_maker)
# Initialise a Blastn object
blast = Aligner(db, align_opt, aligner)
#~print (repr(blast))
# Generate a list of hit containing hits of all sequence in query list in subject
hit_list = []
# Extend the list of hits for each query in a bigger list.
for query in query_list:
hit_list.extend(blast.align(query))
return hit_list
#~~~~~~~GLOBAL IMPORTS~~~~~~~#
# Standard library packages import
import gzip
from os import close, remove, path
from multiprocessing import cpu_count
from time import time
from tempfile import mkstemp
# Local library packages
from pyDNA.Utilities import run_command, file_basename, fgunzip
from BlastHit import BlastHit
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
class Aligner(object):
"""
@class Aligner
@brief Perform de blastn of a DNA query against a blast database. If hits are found, a list of
BlastHit objects is returned.
Blast+ 2.8+ needs to be install and eventually added to the path.
"""
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#~~~~~~~FONDAMENTAL METHODS~~~~~~~#
def __repr__(self):
msg = "BLASTN WRAPPER\n"
msg += "Blastn path : {}\n".format(self.blastn)
msg += "Options : {}\n".format(self.blastn_opt)
msg += repr(self.Blastdb)
return msg
def __str__(self):
return "<Instance of {} from {} >\n".format(self.__class__.__name__, self.__module__)
def __init__ (self, Blastdb, blastn_opt="", blastn="blastn"):
"""
Initialize the object and index the reference genome if necessary
@param Blastdb Blast database object NewDB or ExistingDB
@param blastn_opt Blastn command line options as a string
@param blastn Path ot the bwa executable. If bwa if already added to your system path
do not change the default value
"""
# Creating object variables
self.blastn = blastn
self.Blastdb = Blastdb
# init an option dict and attribute defaut options
self.blastn_opt = "{} -num_threads {} -task {} -outfmt {} -dust {} -db {}".format(
blastn_opt, cpu_count(), "blastn", 6, "no", self.Blastdb.db_path)
#~~~~~~~PUBLIC METHODS~~~~~~~#
def align (self, query):
"""
Blast query against a subject database and return a list of BlastHit object
@param query Path to a fasta file containing the query sequences
@param evalue Cutoff used in blast to select valid hits
@return A list of BlastHit objects if at least one hit was found
@exception (SystemError,OSerror) May be returned by run_command in case of invalid command line
"""
# Build the command line string
query_name = file_basename(query)
# If the fasta file is compressed = extract the file in a temporary file
if query[-2:].lower() == "gz":
print ("Extracting the compressed fasta in a temporary file")
fd, tmp_path = mkstemp()
fgunzip (in_path=query, out_path=tmp_path)
blastn_opt = self.blastn_opt + " -query {}".format(tmp_path)
hits_list = self._align(query_name, blastn_opt)
close(fd)
remove(tmp_path)
return hits_list
# Else just proceed by using the fasta reference
else:
blastn_opt = self.blastn_opt + " -query {}".format(query)
return self._align(query_name, blastn_opt)
def _align (self, query_name, blastn_opt):
print ("Blast {} against {} database with blastn".format(query_name, file_basename (self.Blastdb.db_path))),
# Build the command line string
cmd = "{} {}".format(self.blastn, blastn_opt)
# Run the command line without stdin and asking only stdout
blast_lines = run_command(cmd, stdin=None, ret_stderr=False, ret_stdout=True).splitlines()
for line in blast_lines:
# Parse each result lines and create a BlastHit object
h = line.split()
BlastHit(h[0], h[1] , h[2], h[3], h[4], h[5], h[6], h[7], h[8], h[9], h[10], h[11])
# Sumarize the hit count in the different references
print ("\t{} hits found".format(BlastHit.count_total()))
# Get the list of hits from BlastHit class and reset the class list.
hits_list = BlastHit.get()
BlastHit.reset_list()
return hits_list
#########################################################################################
# #
# CONTAVECT CONFIGURATION FILE #
# #
#########################################################################################
#########################################################################################
# INSTRUCTIONS #
#########################################################################################
# Values can by customized with users values, but the file template must remain
# unchanged, otherwise the program will not be able to load default values.
# Please follow the following recommendations :
# -
# - File path should be indicated as absolute path preferably and should not contain
# blank spaces
# - Values identified with '**' in the descriptor are not recommended to be modified
#
#########################################################################################
# PARAMETERS #
#########################################################################################
#########################
# GENERAL SECTION #
#########################
# General parameters of the program
[General]
# Path where to create a folder in which all files generated during the program will
# be stored. Can be leaved empty if you want to work in the current directory (string)
outdir : ./test/
# String that will be used to prefix all data generated by the program. Cannot
# contain any "\" character. If leaved blank 'out' will be used as a prefix (string)
outprefix : AAV-100k_chr11-22
###############################
#  REFERENCE DEFINITIONS  #
###############################
# These sections corresponds to the definition of References sequence objects. They
# are defined by a name, a path to a fasta file that can contain multiple sequences,
# and finally a list of output files for this reference. To be properly interpreted
# Items in the list needs to be choose among :'bam', 'sam', 'bedgraph', 'bed',
# 'covgraph'.
# BAM and SAM correspond to the standard BAM/SAM format. They will be sorted and with
# a bam index. BAM is much more compact than SAM but not human readeable.
# BEDGRAPH and BAD are data recapitulating the coverage of reads over the sequence.
# bed report the coverage of all positions in the reference while bedgraph is more
# concise since uncovered regions are not reported and contigous positions of same
# coverage are reported only once. for large sequences bedgraph only should be used.
# COVGRAPH is a graphical representation of the coverage as a svg file and should be
# used only for small sized reference (> 50 000 pb). If selected the program will
# generate a graphics per sequence mapped in the reference fasta file.
# It is possible to include as many independant reference as required by duplicating
# a entire Ref section and updating the id number (Ref1, Ref2, Ref3 ... ). The order
# of references is CRITICAL if you want to perform a reference masking since it will
# start by the last reference masked by all the others then the penultimate masked by
# all others except the last and and so on until there is only one reference remaining
[Ref1]
# Name of the reference (string)
name : AAV
# Path to the reference fasta file that can contain multiple sequences.
# Can be gzipped (string)
fasta : ./demo/references/AAV-RSV-GFP.fa.gz
# List of output required for the reference separated by any blank space. See above
# for valid values (list of strings)
output : bam sam bedgraph bed covgraph
[Ref2]
name : Backbone
fasta : ./demo/references/Bacterial_backbone.fa.gz
output : bam sam bedgraph bed covgraph
[Ref3]
name : Helper
fasta : ./demo/references/Helper_plasmid.fa.gz
output : bam sam bedgraph bed covgraph
[Ref4]
name : Ad5
fasta : ./demo/references/Ad5_in_293_genome.fa.gz
output : bam bedgraph
#####################
#  FASTQ FILES  #
#####################
# This program is dedicated to paired en data only and will not work if a pair of fastq
# is not provided. Fastq can be pretreated for quality or adapter trimming or it can
# be done directly within the pipeline with an homemade efficient module.
[Fastq]
# Path to the file containing the forward reads (string)
R1 : ./demo/fastq/10k_AAV_R1.fastq.gz
# Path to the file containing the reverse reads (string)
R2 : ./demo/fastq/10k_AAV_R2.fastq.gz
#################################################
#  REFERENCE MASKING OPTIONS (FACULTATIVE)  #
#################################################
# This section group options to parameters a facultative step of homologies masking
# in ref as explain in the program documentation and above. It is based on the retrieval
# of similarities using blastn from blast+ followed by an hard masking of hits before
# rewritting of a new reference fasta file
[Ref_Masking]
# Flag to activate or deactivate the reference masking. If False the following options
of the section won't be parsed (Boolean)
ref_masking : True
# ** Command line options that will be used by blastn for alignment (string)
blastn_opt : -evalue 0.1
# ** Command line options that will be used by mkblastdb for database creation (string)
mkblastdb_opt :
# ** Path to the blastn executable (not required is added to system path) (string)
blastn : blastn
# ** Path to the mkblastdb executable (not required is added to system path) (string)
mkblastdb : mkblastdb
###############################################
#  FASTQ FILTERING OPTIONS (FACULTATIVE)  #
###############################################
# This section group options to parameters a facultative step of fastq filtering
# in ref as explain in the program documentation and above. 2 types o filtering can be
# made together or individually : A filtering of bad quality reads and a trimming of
# primers homologies found in reads.
[Fastq_Filtering]
# Quality format associated with reads autorized values are : 'fastq-sanger' 'fastq-solexa' or
# 'fastq-illumina'. See Biopython QualityIO documentation for more details. (string)
input_qual : fastq-sanger
# If True a quality filtering base on the general read quality will be performed (Boolean)
quality_filtering : True
# Minimal mean quality of a read to be kept for alignment (Integer)
min_qual : 25
# If True a step of adaptor trimming will be performed (Boolean)
adapter_trimming : True
# List of adapter sequences separated by a blank space (list of strings)
adapters : GATCGGAAGAGCACACGTCTGAACTCCAGTCACACAGTGATCTCGTATGC
# Try to find matched for complementary sequence of all adapters (Boolean)
find_rc : True
# ** Minimal fraction of the read that should remain after adapter trimming (Positive Float)
min_read_len : 0.6
# ** Minimal fraction of the adapter that have to be aligned on reads to be consider as a valid match (Positive Float)
min_match_len : 0.8
# ** Minimal score per base of the adapter to be consider as a valid match (Positive Float)
min_match_score : 1.4
# ** Bonus score for SSW alignment in case of a match (Positive Integer)
ssw_match : 2
# ** Malus score for SSW alignment in case of a match (Positive Integer)
ssw_mismatch : 2
# ** Malus score for SSW alignment in case of gap opening (Positive Integer)
ssw_gap_open : 3
# ** Malus score for SSW alignment in case of gap extension(Positive Integer)
ssw_gap_extend : 1
#############################
#  BWA ALIGNMENT OPTIONS #
#############################
# This section group options to parameters bwa index and alignment with mem
[Bwa_Alignment]
# Path of a bwa index for the reference indicated above. If not available just leave parameter
# blank and a new index will be generated (string)
bwa_index :
# ** Command line options that will be used by bwa mem for alignment (string)
bwa_mem_opt : -M
# ** Command line options that will be used by bwa incdex for reference indexation (string)
bwa_index_opt :
# ** Path to bwa mem executable (not required is added to system path) (string)
bwa_aligner : bwa mem
# ** Path to bwa index executable (not required is added to system path) (string)
bwa_indexer : bwa index
#######################
#  OUTPUT OPTIONS #
#######################
# This section group Miscelianous options for configuring program output
[Output]
# This value correspond to the minimal quality mapping of a read to be considered valid (Positive Integer)
min_mapq : 30
# This value correspond to the minimal size of mapping (Positive integer)
min_size : 25
# If True Garbage reads (unmapped, low mapq and secondary) will be written in bam files (Boolean)
unmapped_bam = True
# If True Garbage reads (unmapped, low mapq and secondary) will be written in sam files (Boolean)
unmapped_sam = True
# Minimal depth of coverage to report the region in bedgraph, bed and covgraph (Positive Integer)
cov_min_depth : 4
# Minimal depth of coverage to report a variant position (Positive Integer)
var_min_depth : 500
# Minimal fraquency of a base at a given position to be considered as frequent (Positive Float)
var_min_freq : 0.2
# -*- coding: utf-8 -*-
"""
@package Quade
@brief Contain the template of the empty configuration file for Quade
@copyright [GNU General Public License v2](http://www.gnu.org/licenses/gpl-2.0.html)
@author Adrien Leger - 2014
* <adrien.leger@gmail.com>
* <adrien.leger@inserm.fr>
* <adrien.leger@univ-nantes.fr>
* [Github](https://github.com/a-slide)
* [Atlantic Gene Therapies - INSERM 1089] (http://www.atlantic-gene-therapies.fr/)
"""
def write_example_conf():
with open ("RefMasker_conf_file.txt", 'wb') as fp:
fp.write ("""
###################################################################################################
# QUADE CONFIGURATION FILE #
###################################################################################################
# Values can by customized with users values, but the file template must remain unchanged,
# otherwise the program will not be able to load default values.
# File path should be indicated as absolute path preferably and should not contain blank spaces
# Values identified with '**' in the descriptor are not recommended to be modified
###################################################################################################
[Blast]
# Options to parameter blastn from blast+ used to find homologies between references
# Output only the best hit per query sequence in the query fasta file (BOOLEAN)
best_per_query_seq : True
# E Value cuttoff to retain alignments (FLOAT)
evalue : 0.1
# type of blast algorithm to perform 'blastn', 'blastn-short', 'dc-megablast', 'megablast',
# 'rmblastn'. Default = megablast
blast_task = dc-megablast
# Command line options that will be used by blastn for alignment (STRING) **
blastn_opt :
# Command line options that will be used by mkblastdb for database creation (STRING) **
mkblastdb_opt :
# Path to the blastn executable (not required is added to system path) (STRING) **
blastn : blastn
# Path to the mkblastdb executable (not required is added to system path) (STRING) **