Commit 3d63ddcd authored by Adrien Leger's avatar Adrien Leger
Browse files

Regroup code needed for RefMasker => Broken code

parent f748a153
# RefMasker
Mask homologies in fasta reference sequences based on an user provided ranking (IN DEVELOPMENT)
The order of references is CRITICAL
# if you want to perform a reference masking since it will
# start by the last reference masked by all the others then the penultimate masked by
# all others except the last and and so on until there is only one reference remaining
#~~~~~~~GLOBAL IMPORTS~~~~~~~#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
class BlastHit(object):
"""
@class BlastHit
@brief Object oriented class containing informations of one blast hit
The following instance field are accessible :
* q_id : Query sequence name
* s_id : Subject sequence name
* identity : % of identity in the hit
* length : length of the hit
* mis : Number of mismatch in the hit
* gap : Number of gap in the hit
* q_orient : Orientation of the query along the hit
* q_start : Hit start position of the query
* q_end : Hit end position of the query
* s_orient : Orientation of the subject along the hit
* s_start : Hit start position of the subject
* s_end : Hit end position of the subject
* evalue : E value of the alignement
* bscore : Bit score of the alignement
A class list is used to track all instances generated.
"""
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#~~~~~~~CLASS FIELDS~~~~~~~#
Instances = [] # Class field used for instance tracking
id_count = 0
#~~~~~~~CLASS METHODS~~~~~~~#
@ classmethod
def next_id (self):
cur_id = self.id_count
self.id_count +=1
return cur_id
@ classmethod
def count_total (self):
"""
@return Overall number of BlastHit object in Instance list
"""
return (len(self.Instances))
@ classmethod
def stat_per_ref (self):
"""
@return Number of BlastHit object in Instance list sorted by reference subject sequence
"""
d = {}
for hit in self.Instances:
if hit.s_id in d:
d[hit.s_id][0] += 1
d[hit.s_id][1] += hit.length
else:
d[hit.s_id] = [1, hit.length]
return d
@ classmethod
def get (self):
"""
@return The list of all BlastHit object generated
"""
return self.Instances
@ classmethod
def get_ref (self, ref):
"""
@param ref Name of a reference sequence in the subject database
@return The list of all BlastHit object generated for this reference
"""
return [hit for hit in self.Instances if hit.s_id == "ref"]
@ classmethod
def reset_list (self):
"""
Reset the instance tracking list (Usefull after
"""
self.Instances = []
self.id_count = 0
#~~~~~~~FONDAMENTAL METHODS~~~~~~~#
def __init__(self, q_id, s_id, identity, length, mis, gap, q_start, q_end, s_start, s_end, evalue, bscore):
"""
Create a BlastHit object which is automatically added to the class tracking instance list
The object with the following parameters are required for object initialisation
@param q_id Query sequence name
@param s_id Subject sequence name
@param identity % of identity in the hit
@param length length of the hit
@param mis Number of mismatch in the hit
@param gap Number of gap in the hit
@param q_start Hit start position of the query
@param q_end Hit end position of the query
@param s_start Hit start position of the subject
@param s_end Hit end position of the subject
@param evalue E value of the alignement
@param bscore Bit score of the alignement
"""
self.id = self.next_id()
self.q_id = q_id
self.s_id = s_id
self.identity = float(identity)
self.length = int(length)
self.mis = int(mis)
self.gap = int(gap)
self.evalue = float(evalue)
self.bscore = float(bscore)
# Autoadapt start and end so that start is always smaller than end
self.q_start = int(q_start) if int(q_start) < int(q_end) else int(q_end)
self.q_end = int(q_end) if int(q_start) < int(q_end) else int(q_start)
self.s_start = int(s_start) if int(s_start) < int(s_end) else int(s_end)
self.s_end = int(s_end) if int(s_start) < int(s_end) else int(s_start)
# Orientation of the query and subject along the hit. True if positive
self.q_orient = int(q_start) < int(q_end)
self.s_orient = int(s_start) < int(s_end)
# Add the instance to the class instance tracking list
self.Instances.append(self)
def __repr__(self):
msg = "HIT {}".format(self.id)
msg += "\tQuery\t{}:{}-{}({})\n".format(self.q_id, self.q_start, self.q_end, "+" if self.q_orient else "-")
msg += "\tSubject\t{}:{}-{}({})\n".format(self.s_id, self.s_start, self.s_end, "+" if self.q_orient else "-")
msg += "\tLenght : {}\tIdentity : {}%\tEvalue : {}\tBit score : {}\n".format(self.length, self.identity, self.evalue, self.bscore)
return (msg)
def __str__(self):
return "<Instance of {} from {} >\n".format(self.__class__.__name__, self.__module__)
#~~~~~~~GLOBAL IMPORTS~~~~~~~#
# Standard library packages import
from os import path
# Local library packages import
from pyDNA.Utilities import mkdir, import_seq, file_basename, file_name, file_extension, fgunzip
from BlastnWrapper import Aligner
from MakeblastdbWrapper import NewDB, ExistingDB
#~~~~~~~MAIN METHODS~~~~~~~#
def align (query_list,
subject_db = None,
subject_fasta = None,
aligner = "blastn",
align_opt="",
db_maker = "makeblastdb",
db_opt="",
db_outdir = "./blast_db/",
db_outname = "out"):
"""
Main function of RefMasker that integrate database creation, blast and homology masking
* Instantiate Blast database and blastn object
* Perform iterative blasts of query sequences against the subject database and create a list of
hits.
@param query_list List of paths indicating fasta files containing query sequences (can be
gzipped). Fasta can contains multiple sequences.
@param subject_db Basename of file from a blast database created by "makeblastdb" if available
@param subject_fasta Reference fasta file. Required if no ref_index is given (can be gzipped)
@param aligner Path ot the blastn executable. Not required if blast+ if added to your path
@param blastn_opt Blastn command line options as a string
@param db_maker Path ot the makeblastdb executable. Not required if blast+ if added to your path
@param db_opt makeblastdb command line options as a string
@param db_outdir Directory where to store the database files
@param db_outname Basename of the database files
@return A list of BlastHit objects
"""
# Try to import an existing database
try:
if not subject_db:
raise Exception("No Blast database was provided")
print("Existing database provided")
db = ExistingDB(subject_db)
# If no DB or if an error occured during validation of the existing DB = create a new db
except Exception as E:
print (E)
# Verify the presence of the reference fasta file
if not subject_fasta or not path.isfile (subject_fasta):
raise Exception("Invalid or no fasta file provided. Cannot create a database")
print ("Generate a database...")
mkdir(db_outdir)
db_path = path.join (db_outdir, db_outname)
# Create the new database
db = NewDB(ref_path=subject_fasta, db_path=db_path, makeblastdb_opt=db_opt, makeblastdb=db_maker)
# Initialise a Blastn object
blast = Aligner(db, align_opt, aligner)
#~print (repr(blast))
# Generate a list of hit containing hits of all sequence in query list in subject
hit_list = []
# Extend the list of hits for each query in a bigger list.
for query in query_list:
hit_list.extend(blast.align(query))
return hit_list
#~~~~~~~GLOBAL IMPORTS~~~~~~~#
# Standard library packages import
import gzip
from os import close, remove, path
from multiprocessing import cpu_count
from time import time
from tempfile import mkstemp
# Local library packages
from pyDNA.Utilities import run_command, file_basename, fgunzip
from BlastHit import BlastHit
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
class Aligner(object):
"""
@class Aligner
@brief Perform de blastn of a DNA query against a blast database. If hits are found, a list of
BlastHit objects is returned.
Blast+ 2.8+ needs to be install and eventually added to the path.
"""
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#~~~~~~~FONDAMENTAL METHODS~~~~~~~#
def __repr__(self):
msg = "BLASTN WRAPPER\n"
msg += "Blastn path : {}\n".format(self.blastn)
msg += "Options : {}\n".format(self.blastn_opt)
msg += repr(self.Blastdb)
return msg
def __str__(self):
return "<Instance of {} from {} >\n".format(self.__class__.__name__, self.__module__)
def __init__ (self, Blastdb, blastn_opt="", blastn="blastn"):
"""
Initialize the object and index the reference genome if necessary
@param Blastdb Blast database object NewDB or ExistingDB
@param blastn_opt Blastn command line options as a string
@param blastn Path ot the bwa executable. If bwa if already added to your system path
do not change the default value
"""
# Creating object variables
self.blastn = blastn
self.Blastdb = Blastdb
# init an option dict and attribute defaut options
self.blastn_opt = "{} -num_threads {} -task {} -outfmt {} -dust {} -db {}".format(
blastn_opt, cpu_count(), "blastn", 6, "no", self.Blastdb.db_path)
#~~~~~~~PUBLIC METHODS~~~~~~~#
def align (self, query):
"""
Blast query against a subject database and return a list of BlastHit object
@param query Path to a fasta file containing the query sequences
@param evalue Cutoff used in blast to select valid hits
@return A list of BlastHit objects if at least one hit was found
@exception (SystemError,OSerror) May be returned by run_command in case of invalid command line
"""
# Build the command line string
query_name = file_basename(query)
# If the fasta file is compressed = extract the file in a temporary file
if query[-2:].lower() == "gz":
print ("Extracting the compressed fasta in a temporary file")
fd, tmp_path = mkstemp()
fgunzip (in_path=query, out_path=tmp_path)
blastn_opt = self.blastn_opt + " -query {}".format(tmp_path)
hits_list = self._align(query_name, blastn_opt)
close(fd)
remove(tmp_path)
return hits_list
# Else just proceed by using the fasta reference
else:
blastn_opt = self.blastn_opt + " -query {}".format(query)
return self._align(query_name, blastn_opt)
def _align (self, query_name, blastn_opt):
print ("Blast {} against {} database with blastn".format(query_name, file_basename (self.Blastdb.db_path))),
# Build the command line string
cmd = "{} {}".format(self.blastn, blastn_opt)
# Run the command line without stdin and asking only stdout
blast_lines = run_command(cmd, stdin=None, ret_stderr=False, ret_stdout=True).splitlines()
for line in blast_lines:
# Parse each result lines and create a BlastHit object
h = line.split()
BlastHit(h[0], h[1] , h[2], h[3], h[4], h[5], h[6], h[7], h[8], h[9], h[10], h[11])
# Sumarize the hit count in the different references
print ("\t{} hits found".format(BlastHit.count_total()))
# Get the list of hits from BlastHit class and reset the class list.
hits_list = BlastHit.get()
BlastHit.reset_list()
return hits_list
#########################################################################################
# #
# CONTAVECT CONFIGURATION FILE #
# #
#########################################################################################
#########################################################################################
# INSTRUCTIONS #
#########################################################################################
# Values can by customized with users values, but the file template must remain
# unchanged, otherwise the program will not be able to load default values.
# Please follow the following recommendations :
# -
# - File path should be indicated as absolute path preferably and should not contain
# blank spaces
# - Values identified with '**' in the descriptor are not recommended to be modified
#
#########################################################################################
# PARAMETERS #
#########################################################################################
#########################
# GENERAL SECTION #
#########################
# General parameters of the program
[General]
# Path where to create a folder in which all files generated during the program will
# be stored. Can be leaved empty if you want to work in the current directory (string)
outdir : ./test/
# String that will be used to prefix all data generated by the program. Cannot
# contain any "\" character. If leaved blank 'out' will be used as a prefix (string)
outprefix : AAV-100k_chr11-22
###############################
#  REFERENCE DEFINITIONS  #
###############################
# These sections corresponds to the definition of References sequence objects. They
# are defined by a name, a path to a fasta file that can contain multiple sequences,
# and finally a list of output files for this reference. To be properly interpreted
# Items in the list needs to be choose among :'bam', 'sam', 'bedgraph', 'bed',
# 'covgraph'.
# BAM and SAM correspond to the standard BAM/SAM format. They will be sorted and with
# a bam index. BAM is much more compact than SAM but not human readeable.
# BEDGRAPH and BAD are data recapitulating the coverage of reads over the sequence.
# bed report the coverage of all positions in the reference while bedgraph is more
# concise since uncovered regions are not reported and contigous positions of same
# coverage are reported only once. for large sequences bedgraph only should be used.
# COVGRAPH is a graphical representation of the coverage as a svg file and should be
# used only for small sized reference (> 50 000 pb). If selected the program will
# generate a graphics per sequence mapped in the reference fasta file.
# It is possible to include as many independant reference as required by duplicating
# a entire Ref section and updating the id number (Ref1, Ref2, Ref3 ... ). The order
# of references is CRITICAL if you want to perform a reference masking since it will
# start by the last reference masked by all the others then the penultimate masked by
# all others except the last and and so on until there is only one reference remaining
[Ref1]
# Name of the reference (string)
name : AAV
# Path to the reference fasta file that can contain multiple sequences.
# Can be gzipped (string)
fasta : ./demo/references/AAV-RSV-GFP.fa.gz
# List of output required for the reference separated by any blank space. See above
# for valid values (list of strings)
output : bam sam bedgraph bed covgraph
[Ref2]
name : Backbone
fasta : ./demo/references/Bacterial_backbone.fa.gz
output : bam sam bedgraph bed covgraph
[Ref3]
name : Helper
fasta : ./demo/references/Helper_plasmid.fa.gz
output : bam sam bedgraph bed covgraph
[Ref4]
name : Ad5
fasta : ./demo/references/Ad5_in_293_genome.fa.gz
output : bam bedgraph
#####################
#  FASTQ FILES  #
#####################
# This program is dedicated to paired en data only and will not work if a pair of fastq
# is not provided. Fastq can be pretreated for quality or adapter trimming or it can
# be done directly within the pipeline with an homemade efficient module.
[Fastq]
# Path to the file containing the forward reads (string)
R1 : ./demo/fastq/10k_AAV_R1.fastq.gz
# Path to the file containing the reverse reads (string)
R2 : ./demo/fastq/10k_AAV_R2.fastq.gz
#################################################
#  REFERENCE MASKING OPTIONS (FACULTATIVE)  #
#################################################
# This section group options to parameters a facultative step of homologies masking
# in ref as explain in the program documentation and above. It is based on the retrieval
# of similarities using blastn from blast+ followed by an hard masking of hits before
# rewritting of a new reference fasta file
[Ref_Masking]
# Flag to activate or deactivate the reference masking. If False the following options
of the section won't be parsed (Boolean)
ref_masking : True
# ** Command line options that will be used by blastn for alignment (string)
blastn_opt : -evalue 0.1
# ** Command line options that will be used by mkblastdb for database creation (string)
mkblastdb_opt :
# ** Path to the blastn executable (not required is added to system path) (string)
blastn : blastn
# ** Path to the mkblastdb executable (not required is added to system path) (string)
mkblastdb : mkblastdb
###############################################
#  FASTQ FILTERING OPTIONS (FACULTATIVE)  #
###############################################
# This section group options to parameters a facultative step of fastq filtering
# in ref as explain in the program documentation and above. 2 types o filtering can be
# made together or individually : A filtering of bad quality reads and a trimming of
# primers homologies found in reads.
[Fastq_Filtering]
# Quality format associated with reads autorized values are : 'fastq-sanger' 'fastq-solexa' or
# 'fastq-illumina'. See Biopython QualityIO documentation for more details. (string)
input_qual : fastq-sanger
# If True a quality filtering base on the general read quality will be performed (Boolean)
quality_filtering : True
# Minimal mean quality of a read to be kept for alignment (Integer)
min_qual : 25
# If True a step of adaptor trimming will be performed (Boolean)
adapter_trimming : True
# List of adapter sequences separated by a blank space (list of strings)
adapters : GATCGGAAGAGCACACGTCTGAACTCCAGTCACACAGTGATCTCGTATGC
# Try to find matched for complementary sequence of all adapters (Boolean)
find_rc : True
# ** Minimal fraction of the read that should remain after adapter trimming (Positive Float)
min_read_len : 0.6
# ** Minimal fraction of the adapter that have to be aligned on reads to be consider as a valid match (Positive Float)
min_match_len : 0.8
# ** Minimal score per base of the adapter to be consider as a valid match (Positive Float)
min_match_score : 1.4
# ** Bonus score for SSW alignment in case of a match (Positive Integer)
ssw_match : 2
# ** Malus score for SSW alignment in case of a match (Positive Integer)
ssw_mismatch : 2
# ** Malus score for SSW alignment in case of gap opening (Positive Integer)
ssw_gap_open : 3
# ** Malus score for SSW alignment in case of gap extension(Positive Integer)
ssw_gap_extend : 1
#############################
#  BWA ALIGNMENT OPTIONS #
#############################
# This section group options to parameters bwa index and alignment with mem
[Bwa_Alignment]
# Path of a bwa index for the reference indicated above. If not available just leave parameter
# blank and a new index will be generated (string)
bwa_index :
# ** Command line options that will be used by bwa mem for alignment (string)
bwa_mem_opt : -M
# ** Command line options that will be used by bwa incdex for reference indexation (string)
bwa_index_opt :
# ** Path to bwa mem executable (not required is added to system path) (string)
bwa_aligner : bwa mem
# ** Path to bwa index executable (not required is added to system path) (string)
bwa_indexer : bwa index
#######################
#  OUTPUT OPTIONS #
#######################
# This section group Miscelianous options for configuring program output
[Output]
# This value correspond to the minimal quality mapping of a read to be considered valid (Positive Integer)
min_mapq : 30
# This value correspond to the minimal size of mapping (Positive integer)
min_size : 25
# If True Garbage reads (unmapped, low mapq and secondary) will be written in bam files (Boolean)
unmapped_bam = True
# If True Garbage reads (unmapped, low mapq and secondary) will be written in sam files (Boolean)
unmapped_sam = True
# Minimal depth of coverage to report the region in bedgraph, bed and covgraph (Positive Integer)
cov_min_depth : 4
# Minimal depth of coverage to report a variant position (Positive Integer)
var_min_depth : 500
# Minimal fraquency of a base at a given position to be considered as frequent (Positive Float)
var_min_freq : 0.2
#~~~~~~~GLOBAL IMPORTS~~~~~~~#
# Standard library packages import
import gzip
from os import close, remove, path
from time import time
from tempfile import mkstemp
# Local library packages
from pyDNA.Utilities import run_command, file_basename, fgunzip
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
class NewDB(object):
"""
@class NewDB
@brief Wrapper for makeblastdb index. Create a subject database from a fasta file.
Blast+ 2.8+ needs to be install and correctly added to the path.
"""
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#~~~~~~~FONDAMENTAL METHODS~~~~~~~#
def __repr__(self):
msg = "MAKEBLASTDB WRAPPER (NEW DB)\n"
msg += "Makeblastdb path : {}\n".format(self.makeblastdb)
msg += "Blastn database path : {}\n".format(self.db_path)
msg += "Options : {}\n".format(self.makeblastdb_opt)
return msg
def __str__(self):
return "\n<Instance of {} from {} >\n".format(self.__class__.__name__, self.__module__)
def __init__ (self, ref_path, db_path="./out", makeblastdb_opt="", makeblastdb="makeblastdb"):
"""
Create a blastdb from a reference fastq file
@param ref_path Path of the fasta file containing the reference sequence. Can be gzipped
but in this case the compressed file will be extracted in a temporary file
@param db_path Outname for the blast db files basename.
@param makeblastdb_opt makeblastdb command line options as a string
@param makeblastdb Path ot the makeblastdb executable. If blast+ if already added to your
system path do not change the default value
"""
# Creating object variables
self.makeblastdb = makeblastdb
self.db_path = db_path
self.db_name = file_basename(ref_path)
# init an option dict and attribute defaut options
self.makeblastdb_opt = "{} -out {} -dbtype {} -input_type {} ".format (
makeblastdb_opt, self.db_path, "nucl", "fasta")
try:
# If the fasta file is compressed = extract the file in a tempory file
if ref_path[-2:].lower() == "gz":
print ("Extracting the compressed fasta in a temporary file")
fd, tmp_path = mkstemp()
fgunzip (in_path=ref_path, out_path=tmp_path)
self.makeblastdb_opt += "-in {}".format(tmp_path)
self._make_db()
close(fd)
remove(tmp_path)
# Else just proceed by using the fasta reference
else:
self.makeblastdb_opt += "-in {}".format(ref_path)
self._make_db()
except Exception as E:
self._remove_db_files()
raise Exception (E.message+"Impossible to generate a valid database from the reference sequence")
#~~~~~~~PRIVATE METHODS~~~~~~~#
def _make_db(self):
"""
Create a blastn database from ref_path using makeblastdb
"""
# Build the command line
cmd = "{} {}".format(self.makeblastdb, self.makeblastdb_opt)