Snakefile 13.5 KB
Newer Older
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
1
2
3
import os
import sys
import glob
4
import re
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
5
6
7
8
9
10
11
12
13
14
15
16
17
18

####################	 FUNCTIONS	############################


# retrieves all forward/reverse fastq files from config file based on the sample name (list of all chunks)
def getFastqs(wildcards):
	for s in config["samples"]:
		if(s["sample"] == wildcards.sampleName):
			chunks=list()
			if(wildcards.read == "R1"):
				for pair in s["files"]:
					chunks.append(pair["forward"]["path"])
			if(wildcards.read == "R2"):
				for pair in s["files"]:
19
					chunks.append(pair["reverse"]["path"])
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
			return chunks

def getFastqPair(wildcards):
	for s in config["samples"]:
		if(s["sample"] == wildcards.sampleName):
			if(wildcards.strand=="R1"):
				return s["files"][int(wildcards.pairNumber)]["forward"]["path"]
			elif(wildcards.strand=="R2"):
				return s["files"][int(wildcards.pairNumber)]["reverse"]["path"]


def getCleanedFastq(wildcards):
	fastqs = list()
	for s in config["samples"]:
		if(s["sample"] == wildcards.sampleName):
			nbPairs = len(s["files"])
			if(wildcards.read == "R1"):
				for nb in range(nbPairs):
					fastqs.append(OUTPUTDIR+"/Samples/"+s["sample"]+"/CUTADAPT/"+s["sample"]+"_"+str(nb)+"_R1.fastq.gz")
			if(wildcards.read == "R2"):
				for nb in range(nbPairs):
					fastqs.append(OUTPUTDIR+"/Samples/"+s["sample"]+"/CUTADAPT/"+s["sample"]+"_"+str(nb)+"_R2.fastq.gz")
42
	fastqs.sort()
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
43
44
45
46
	return fastqs

def getConditions():
	sampleList = list()
47
48
49
50
51
52
	for s in config["samplesCondition"]:
		sd = dict()
		sd["samplename"] = s["name"]
		sd["filename"] = s["name"]
		sd["condition"] = s["condition"]
		sampleList.append(sd)
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
53
54
	return sampleList

55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
def getConditionForComp(wildcards):
	s = re.search("(^[\w-]+)__vs__([\w-]+)$", wildcards.comp)
	d = dict()
	d["cond1"] = s.group(1)
	d["cond2"] = s.group(2)
	#return {"cond1":s.group(1),"cond2":s.group(2)}
	return d

## Fonction pour {wildcards.comp}
#def getTargetAll{wildcards.comp}
#	fastqs = list{wildcards.comp}
#	for s in conf{wildcards.comp}
#		nbPairs ={wildcards.comp}
#		for nb in{wildcards.comp}
#			fastq{wildcards.comp}"/Samples/"+s["sample"]+"/CUTADAPT/"+s["sample"]+"_"+str(nb)+"_R1.fastq.gz")
#			fastq{wildcards.comp}"/Samples/"+s["sample"]+"/CUTADAPT/"+s["sample"]+"_"+str(nb)+"_R2.fastq.gz")
#	return fastqs{wildcards.comp}
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
72
73

####################	 CMD to launch pipeline   ############################
Eric CHARPENTIER's avatar
Eric CHARPENTIER committed
74
## snakemake --config proj="project.json" conf="config.json" -nrp
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
75
76

########## loading configuration variable ##################
Eric CHARPENTIER's avatar
Eric CHARPENTIER committed
77
78
configfile: config["proj"]
configfile: config["conf"]
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
79
80

########## define global variables ##########
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
81
SNAKEFILEDIR = workflow.basedir
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
82
OUTPUTDIR = os.path.abspath(config["outdir"])
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
83
84
SCRIPTPATH = os.path.join(SNAKEFILEDIR,"scripts")

85
86
87
GTF = config["reference"]["gtf"]

FASTA = config["reference"]["fasta"]
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
88
FASTABASENAME = os.path.splitext(FASTA)[0]
89
90

BIOMART = config["reference"]["biomart"]
91
REFNAME = config["reference"]["name"]
92
93
94

GENOMEDIR=config["reference"]["STARindexDir"]

Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
95
96
FWADAPT=config["cutadapt-forward"]
RVADAPT=config["cutadapt-reverse"]
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
97
98
99
100
101

## Overhang calculation for the generation of the reference files in STAR
OVERHANG = int(config["read-length"]) - 1

# List samples to trigger the target rule to create
102
SAMPLESALL = list()
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
103
for s in config["samples"]:
104
105
106
107
	SAMPLESALL.append(s["sample"])

# List comparisons to trigger the target rule
COMPARISONS = [c for c in config["comparisons"]]
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
108
109

# List of the direction of the read
110
READS=["R1","R2"]
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
111
112
113
114
115


###############################################
###############################################

116
rule all:
117
	input:
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
118
	 	qc=OUTPUTDIR+"/Report/data/general/fastqc/multiqc_report.html",
119
120
	 	#fastqc=expand(OUTPUTDIR+"/Samples/{sampleName}/FASTQC_cleaned/{sampleName}_{read}.concat_fastqc.html",sampleName=SAMPLESALL,read=READS)
	 	index=expand(OUTPUTDIR+"/Samples/{sampleName}/STAR/{sampleName}Aligned.sortedByCoord.out.bam.bai",sampleName=SAMPLESALL) ,
Eric CHARPENTIER's avatar
Eric CHARPENTIER committed
121
122
	 	#deg=expand(OUTPUTDIR+"/DESEQ2/results/{suffixe}",suffixe=["NormalizedCountMatrix.txt","NormalizedCountMatrixFiltered.txt","PCAplot.png","sampletosampledistance.jpeg"])
		rep=OUTPUTDIR+"/Report/report.html"
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
123
124

rule fastqc:
125
	input: 	OUTPUTDIR+"/Samples/{sampleName}/FASTQC/{sampleName}_{read}.fastq.gz"
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
126
	output: OUTPUTDIR+"/Samples/{sampleName}/FASTQC/{sampleName}_{read}_fastqc.html"
127
128
129
130
	params:	outFolder = OUTPUTDIR+"/Samples/{sampleName}/FASTQC"
	shell: 	"""
			fastqc -o {params.outFolder} {input}
			"""
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
131

132
rule fastqc_cleaned:
133
	input:	OUTPUTDIR+"/Samples/{sampleName}/CUTADAPT/{sampleName}_{read}.concat.fastq.gz"
134
	output: OUTPUTDIR+"/Samples/{sampleName}/FASTQC_cleaned/{sampleName}_{read}.concat_fastqc.html"
135
136
137
138
	params:	outFolder = OUTPUTDIR+"/Samples/{sampleName}/FASTQC_cleaned"
	shell: 	"""
			fastqc -o {params.outFolder} {input}
			"""
139

Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
140
rule concat_fastq_chunks:
141
	input: 	getFastqs
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
142
143
144
145
146
147
	output: temp(OUTPUTDIR+"/Samples/{sampleName}/FASTQC/{sampleName}_{read}.fastq.gz")
	run:
		chunks = ' '.join(input)
		shell("cat {chunks} > {output}")

rule multiQC:
148
149
150
	input: 	f1=expand(OUTPUTDIR+"/Samples/{sampleName}/FASTQC/{sampleName}_{read}_fastqc.html", sampleName=SAMPLESALL, read=READS),
			f2=expand(OUTPUTDIR+"/Samples/{sampleName}/FASTQC_cleaned/{sampleName}_{read}.concat_fastqc.html", sampleName=SAMPLESALL, read=READS),
			bam=expand(OUTPUTDIR+"/Samples/{sampleName}/STAR/{sampleName}Aligned.sortedByCoord.out.bam", sampleName=SAMPLESALL)
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
151
	output: OUTPUTDIR+"/Report/data/general/fastqc/multiqc_report.html"
152
153
154
	shell: 	"""
			multiqc -f -e general_stats -e tophat -e bowtie2 {OUTPUTDIR}/Samples -o {OUTPUTDIR}/Report/data/general/fastqc
			"""
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
155
156
157
158
159

rule dezipFastq:
	input:	getFastqPair
	output:	temp(OUTPUTDIR+"/Samples/{sampleName}/PRINSEQ/{sampleName}_{pairNumber}_{strand}.temp.fastq")
	shell:	"""
160
161
			gzip -dc {input} > {output}
			"""
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
162
163

rule prinseq:
164
	input: 	expand(OUTPUTDIR+"/Samples/{{sampleName}}/PRINSEQ/{{sampleName}}_{{pairNumber}}_{strand}.temp.fastq", strand=READS)
Eric CHARPENTIER's avatar
Eric CHARPENTIER committed
165
	output:	temp(expand(OUTPUTDIR+"/Samples/{{sampleName}}/PRINSEQ/{{sampleName}}_{{pairNumber}}_{suffixe}.fastq", suffixe=["bad_1","bad_2","good_1","good_2","good_1_singletons","good_2_singletons"])),
166
			log = OUTPUTDIR+"/Samples/{sampleName}/PRINSEQ/{sampleName}_{pairNumber}.log"
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
167
	params:	preGood = OUTPUTDIR+"/Samples/{sampleName}/PRINSEQ/{sampleName}_{pairNumber}_good",
168
			preBad = OUTPUTDIR+"/Samples/{sampleName}/PRINSEQ/{sampleName}_{pairNumber}_bad"
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
169
	shell:	"""
170
171
172
173
			perl `which prinseq-lite.pl` -min_qual_mean 30 -no_qual_header -fastq {input[0]} -fastq2 {input[1]} -out_good {params.preGood} -out_bad {params.preBad} -log {output.log}
			if(! test -f {params.preBad}_1.fastq);then touch {params.preBad}_1.fastq {params.preGood}_1_singletons.fastq;fi
			if(! test -f {params.preBad}_2.fastq);then touch {params.preBad}_2.fastq {params.preGood}_2_singletons.fastq;fi
			"""
174

Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
175
176
rule concatPrintseqOutput:
	input:	bad = OUTPUTDIR+"/Samples/{sampleName}/PRINSEQ/{sampleName}_{pairNumber}_bad_{strand}.fastq",
177
178
			good = OUTPUTDIR+"/Samples/{sampleName}/PRINSEQ/{sampleName}_{pairNumber}_good_{strand}.fastq",
			singleton = OUTPUTDIR+"/Samples/{sampleName}/PRINSEQ/{sampleName}_{pairNumber}_good_{strand}_singletons.fastq"
Eric CHARPENTIER's avatar
Eric CHARPENTIER committed
179
	output:	temp(OUTPUTDIR+"/Samples/{sampleName}/PRINSEQ/{sampleName}_{pairNumber}_R{strand}.fastq.gz")
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
180
181
	params: tmpdir = OUTPUTDIR+"/Samples/{sampleName}/PRINSEQ/"
	shell:	"""
182
183
			awk '{{if(NR%2==1){{print}} else{{lineLength=length();if(NR%4==0){{curCar="#"}}else{{curCar="N"}};for(c=0;c<lineLength;c++)printf(curCar); printf "\\n"}}}}' {input.bad} | cat {input.good} {input.singleton} - | paste - - - - | LC_ALL=C sort -k1,1 -t " " -T {params.tmpdir} | tr "\\t" "\\n" | gzip --best - > {output}
			"""
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
184
185

rule cutadaptR1:
186
	input:	OUTPUTDIR+"/Samples/{sampleName}/PRINSEQ/{sampleName}_{pairNumber}_R1.fastq.gz"
Eric CHARPENTIER's avatar
Eric CHARPENTIER committed
187
	output:	fastq = temp(OUTPUTDIR+"/Samples/{sampleName}/CUTADAPT/{sampleName}_{pairNumber}_R1.fastq.gz"),
188
			report = OUTPUTDIR+"/Samples/{sampleName}/CUTADAPT/{sampleName}_{pairNumber}_R1.fastq.report"
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
189
	shell:	"""
190
191
			cutadapt -a {FWADAPT} 2> {output.report} {input} | java -jar {SCRIPTPATH}/pademptyfastq-fat.jar 2> /dev/null | gzip --best -c > {output.fastq}
			"""
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
192
193

rule cutadaptR2:
194
	input:	OUTPUTDIR+"/Samples/{sampleName}/PRINSEQ/{sampleName}_{pairNumber}_R2.fastq.gz"
Eric CHARPENTIER's avatar
Eric CHARPENTIER committed
195
	output:	fastq = temp(OUTPUTDIR+"/Samples/{sampleName}/CUTADAPT/{sampleName}_{pairNumber}_R2.fastq.gz"),
196
			report = OUTPUTDIR+"/Samples/{sampleName}/CUTADAPT/{sampleName}_{pairNumber}_R2.fastq.report"
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
197
	shell:	"""
198
199
			cutadapt -a {RVADAPT} 2> {output.report} {input} | java -jar {SCRIPTPATH}/pademptyfastq-fat.jar 2> /dev/null | gzip --best -c > {output.fastq}
			"""
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
200
201

rule concat_cleanedfastq_chunks:
202
	input: 	getCleanedFastq
Eric CHARPENTIER's avatar
Eric CHARPENTIER committed
203
	output: OUTPUTDIR+"/Samples/{sampleName}/CUTADAPT/{sampleName}_{read}.concat.fastq.gz"
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
204
205
206
207
208
209
	run:
		chunks = ' '.join(input)
		shell("cat {chunks} > {output}")


rule genomeFiles:
210
	input:	GTF,
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
211
    		FASTA
212
213
214
	output: GENOMEDIR+"/Genome"
	params: cpu=config["align-cpu"]
	shell: 	"""
215
			STAR --runThreadN {params.cpu} --runMode genomeGenerate --genomeDir {GENOMEDIR} --genomeFastaFiles {FASTA} --sjdbGTFfile {GTF} --sjdbOverhang {OVERHANG} --limitGenomeGenerateRAM 95543555797
216
			"""
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
217
218

rule star_PE:
219
220
221
222
223
	input:
        	R1=OUTPUTDIR+"/Samples/{sampleName}/CUTADAPT/{sampleName}_R1.concat.fastq.gz",
	        R2=OUTPUTDIR+"/Samples/{sampleName}/CUTADAPT/{sampleName}_R2.concat.fastq.gz",
	        genome=GENOMEDIR+"/Genome"
	output: OUTPUTDIR+"/Samples/{sampleName}/STAR/{sampleName}Aligned.sortedByCoord.out.bam"
224
225
226
227
	params:	cpu=config["align-cpu"]
	shell:	"""
			STAR --runThreadN {params.cpu} --genomeDir {GENOMEDIR} --readFilesCommand zcat --outFileNamePrefix {OUTPUTDIR}/Samples/{wildcards.sampleName}/STAR/{wildcards.sampleName} --readFilesIn {input.R1} {input.R2} --outSAMtype BAM SortedByCoordinate --outBAMsortingThreadN 6
			"""
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
228
229
230


rule createBamIndex:
231
	input: 	OUTPUTDIR+"/Samples/{sampleName}/STAR/{sampleName}Aligned.sortedByCoord.out.bam"
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
232
	output: OUTPUTDIR+"/Samples/{sampleName}/STAR/{sampleName}Aligned.sortedByCoord.out.bam.bai"
233
234
235
	shell: 	"""
			samtools index {input} {output}
			"""
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
236
237
238


rule htseq:
239
	input: 	OUTPUTDIR+"/Samples/{sampleName}/STAR/{sampleName}Aligned.sortedByCoord.out.bam"
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
240
	output: OUTPUTDIR+"/DESEQ2/counts/{sampleName}"
241
242
	params:	library_type=config["library-type"]
	shell:  """
243
244
245
246
247
248
249
250
251
252
			if [ {params.library_type} = "reverse" ]
			then
				htseq-count -s reverse -f bam {input} {GTF} > {output}
			elif [ {params.library_type} = "yes" ]
			then
				htseq-count -s yes -f bam {input} {GTF} > {output}
			else
				htseq-count -s no -f bam {input} {GTF} > {output}
			fi
			"""
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
253
254
255

rule deseq2_conditions:
	output: tab = OUTPUTDIR+"/DESEQ2/DESEQ2_CONDITIONS.tab"
256
	run:
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
257
258
259
260
261
262
263
		condArray = getConditions()
		with open(output.tab, 'w') as condfile:
			condfile.write("samplename"+"\t"+"filename"+"\t"+"condition"+"\n")
			for s in condArray:
				condfile.write(s["samplename"]+"\t"+s["filename"]+"\t"+s["condition"]+"\n")

rule deseq2:
264
265
266
267
	input:	conditions=OUTPUTDIR+"/DESEQ2/DESEQ2_CONDITIONS.tab",
			counts=expand(OUTPUTDIR+"/DESEQ2/counts/{sampleName}",sampleName=SAMPLESALL)
	output: expand(OUTPUTDIR+"/DESEQ2/results/{suffixe}",suffixe=["NormalizedCountMatrix.txt","NormalizedCountMatrixFiltered.txt","PCAplot.png","sampletosampledistance.jpeg"]),
			OUTPUTDIR+"/DESEQ2/rData/dds.RData"
268
269
	params:	outdir = OUTPUTDIR+"/DESEQ2/results",
			countsdir = OUTPUTDIR+"/DESEQ2/counts"
Audrey BIHOUEE's avatar
Audrey BIHOUEE committed
270
	shell:	"""
271
			source activate rnaDE
272
			cat {SCRIPTPATH}/run_deseq2.R | R --slave --args {input.conditions} {params.countsdir} {params.outdir} {BIOMART} 
273
274
275
276
			"""

rule deg:
	input:	rdata=OUTPUTDIR+"/DESEQ2/rData/dds.RData"
277
	output:	expand(OUTPUTDIR+"/DESEQ2/results/{{comp}}/{{comp}}_{file}",file=["DEseqRes.tsv","All_DEG.tsv","UP_DEG.tsv","DOWN_DEG.tsv","Volcano-plot.png","MA-plot.png","clustDEgene.png"])
278
279
	params:	cond1=lambda wildcards : getConditionForComp(wildcards)["cond1"],
			cond2=lambda wildcards : getConditionForComp(wildcards)["cond2"]
280
	shell:	"""
281
			source activate rnaDE
282
			cat {SCRIPTPATH}/deg.R | R --slave --args {input.rdata} {params.cond1} {params.cond2}
283
			"""
284
			
285
286
287
288
rule annot:
	input:	expand(OUTPUTDIR+"/DESEQ2/results/{{comp}}/{{comp}}_{file}", file=["All_DEG.tsv","DEseqRes.tsv"]),
			corrAnnotations=(SCRIPTPATH+"/corresIDorg.txt"),
			rdata=OUTPUTDIR+"/DESEQ2/rData/dds.RData"
289
	output:	expand(OUTPUTDIR+"/DESEQ2/results/{{comp}}/{{comp}}_{file}",file=["gseGo.txt","gseKegg.txt"])
290
291
	params:	cond1=lambda wildcards : getConditionForComp(wildcards)["cond1"],
			cond2=lambda wildcards : getConditionForComp(wildcards)["cond2"]
292
	shell:	"""
293
			source activate rnaDE
294
			cat {SCRIPTPATH}/annot.R | R --slave --args {input.rdata} {params.cond1} {params.cond2} {REFNAME} {input.corrAnnotations}
295
			"""
Eric CHARPENTIER's avatar
Eric CHARPENTIER committed
296
297

rule copyDataForReport:
298
	input:
299
			expand(OUTPUTDIR+"/DESEQ2/results/{suffixe}",suffixe=["NormalizedCountMatrix.txt","NormalizedCountMatrixFiltered.txt","PCAplot.png","sampletosampledistance.jpeg"]),
300
			expand(OUTPUTDIR+"/DESEQ2/results/{comp}/{comp}_{file}",file=["gseGo.txt","gseKegg.txt","DEseqRes.tsv","All_DEG.tsv","UP_DEG.tsv","DOWN_DEG.tsv","Volcano-plot.png","MA-plot.png","clustDEgene.png"], comp=COMPARISONS)
Eric CHARPENTIER's avatar
Eric CHARPENTIER committed
301
	output:
302
			temp(touch(OUTPUTDIR+"/Report/TEMPLATE/templateCopied.txt"))
Eric CHARPENTIER's avatar
Eric CHARPENTIER committed
303
	params:
304
305
			templateFolder = OUTPUTDIR+"/Report/TEMPLATE",
			deseqFolder = OUTPUTDIR+"/Report/DESEQ2"
Eric CHARPENTIER's avatar
Eric CHARPENTIER committed
306
	shell:
307
308
309
310
311
312
313
314
			"""
			if test -d {params.templateFolder}; then rm -rf {params.templateFolder};fi
			mkdir {params.templateFolder}
			cp -r {SNAKEFILEDIR}/TEMPLATE/assets {SNAKEFILEDIR}/TEMPLATE/images {params.templateFolder}
			if test -d {params.deseqFolder}; then rm -rf {params.deseqFolder};fi
			mkdir {params.deseqFolder}
			cp -r {OUTPUTDIR}/DESEQ2/results {params.deseqFolder}
			"""
Eric CHARPENTIER's avatar
Eric CHARPENTIER committed
315
316

rule buildReport:
317
318
	input:	OUTPUTDIR+"/Report/TEMPLATE/templateCopied.txt"
	output:	report = OUTPUTDIR+"/Report/report.html"
Eric CHARPENTIER's avatar
Eric CHARPENTIER committed
319
	shell:
320
			"""
Eric CHARPENTIER's avatar
Eric CHARPENTIER committed
321
			python {SNAKEFILEDIR}/scripts/make_html.py -c {config[conf]} -s {config[proj]} -t {SNAKEFILEDIR}/TEMPLATE/index.html -o {output.report}
322
			"""