Commit 80a239b7 authored by Eric CHARPENTIER's avatar Eric CHARPENTIER 🐍
Browse files

added fpkm and tpm counts

parent 8f04370f
......@@ -18,6 +18,7 @@ dependencies:
- snakemake-minimal>=5.2
- star=2.7.3a
- openjdk=8.0
- rseqc
- simplejson
- urllib3
- gzip
......
......@@ -21,7 +21,7 @@ The main steps of the pipeline are:
### System requirements
The only requirement is to have a working install of [conda](https://www.anaconda.com/download/#linux) and [git](https://git-scm.com/downloads).
The species specific resources files (reference genome, gtf) have to be downloaded manually if not human.
The species specific resources files (reference genome, gtf) have to be downloaded manually.
### Cloning the repository
......@@ -129,7 +129,7 @@ snakemake --config proj="project.json" conf="config.json" --cluster "qsub -e ./l
> **Note:**
> - Use the "-j" option (number of parallel jobs to run) appropriately
> - If your cluster is not managed by SGE, you will have to build a wrapper for the jobs and specify it with the option "--jobscript".
> - If your cluster is not managed by SGE, you will have to build a wrapper for the jobs and specify it with the option `--jobscript`.
> - In order to avoid filling all the available RAM, you can specify `--resources parallel_star=X` where `X` is the number of parallel STAR alignment you want to run. Alors note that each STAR job uses 10 threads by default.
# Quick launch guide on provided test data
......
......@@ -2,6 +2,8 @@ import os
import sys
import glob
import re
import pandas as pd
from pathlib import Path
#################### FUNCTIONS ############################
......@@ -57,23 +59,12 @@ def getConditionForComp(wildcards):
d = dict()
d["cond1"] = s.group(1)
d["cond2"] = s.group(2)
#return {"cond1":s.group(1),"cond2":s.group(2)}
return d
## Fonction pour {wildcards.comp}
#def getTargetAll{wildcards.comp}
# fastqs = list{wildcards.comp}
# for s in conf{wildcards.comp}
# nbPairs ={wildcards.comp}
# for nb in{wildcards.comp}
# fastq{wildcards.comp}"/Samples/"+s["sample"]+"/CUTADAPT/"+s["sample"]+"_"+str(nb)+"_R1.fastq.gz")
# fastq{wildcards.comp}"/Samples/"+s["sample"]+"/CUTADAPT/"+s["sample"]+"_"+str(nb)+"_R2.fastq.gz")
# return fastqs{wildcards.comp}
#################### CMD to launch pipeline ############################
## snakemake --config proj="project.json" conf="config.json" -nrp
########## loading configuration variable ##################
########## loading configuration files ##################
configfile: config["proj"]
configfile: config["conf"]
......@@ -120,6 +111,7 @@ rule all:
index=expand(OUTPUTDIR+"/Samples/{sampleName}/STAR/{sampleName}Aligned.sortedByCoord.out.bam.bai",sampleName=SAMPLESALL),
#htseq=expand(OUTPUTDIR+"/DESEQ2/counts/{sampleName}",sampleName=SAMPLESALL)
deg=expand(OUTPUTDIR+"/DESEQ2/results/{suffixe}",suffixe=["NormalizedCountMatrix.txt","NormalizedCountMatrixFiltered.txt","PCAplot.png","sampletosampledistance.jpeg"]),
fpkm=OUTPUTDIR+"/FPKM/tpm.tsv",
rep=OUTPUTDIR+"/Report/report.html"
rule fastqc:
......@@ -236,7 +228,6 @@ rule createBamIndex:
samtools index {input} {output}
"""
rule htseq:
input: OUTPUTDIR+"/Samples/{sampleName}/STAR/{sampleName}Aligned.sortedByCoord.out.bam"
output: OUTPUTDIR+"/DESEQ2/counts/{sampleName}"
......@@ -253,6 +244,44 @@ rule htseq:
fi
"""
rule makeBedFromGtf:
input: {GTF}
output: OUTPUTDIR+"/FPKM/genes.sorted.bed12"
params: outdir = OUTPUTDIR+"/FPKM/"
shell: """
{SCRIPTPATH}/gtfToGenePred {input} {params.outdir}/genes.genePred
{SCRIPTPATH}/genePredToBed {params.outdir}/genes.genePred {params.outdir}/genes.combined.bed12
sort -T {params.outdir} -k1,1 -k2,2n {params.outdir}/genes.combined.bed12 > {output}
"""
rule fpkm:
input: bam = OUTPUTDIR+"/Samples/{sampleName}/STAR/{sampleName}Aligned.sortedByCoord.out.bam",
bai = OUTPUTDIR+"/Samples/{sampleName}/STAR/{sampleName}Aligned.sortedByCoord.out.bam.bai",
bed = OUTPUTDIR+"/FPKM/genes.sorted.bed12"
output: OUTPUTDIR+"/Samples/{sampleName}/RSEQC/{sampleName}.FPKM.xls"
params: outbase = OUTPUTDIR+"/Samples/{sampleName}/RSEQC/{sampleName}"
shell: """
FPKM_count.py -i {input.bam} -o {params.outbase} -r {input.bed}
"""
rule joinFPKM:
input: expand(OUTPUTDIR+"/Samples/{sampleName}/RSEQC/{sampleName}.FPKM.xls", sampleName=SAMPLESALL)
output: OUTPUTDIR+"/FPKM/fpkm.tsv"
run:
frames = list()
for f in input:
df = pd.read_csv(f, header=0, sep="\t", index_col="accession", usecols=["accession","FPKM"])
df.columns = [Path(str(f)).with_suffix('').with_suffix('').stem]
frames.append(df)
results = pd.concat(frames, axis=1)
results.to_csv(str(output), sep="\t", na_rep="0.0", index_label="transcript")
rule FPKMtoTPM:
input: OUTPUTDIR+"/FPKM/fpkm.tsv"
output: OUTPUTDIR+"/FPKM/tpm.tsv"
shell: """
cat {SCRIPTPATH}/fpkm2tpm.R | R --slave --args {input} {output}
"""
rule deseq2_conditions:
output: tab = OUTPUTDIR+"/DESEQ2/DESEQ2_CONDITIONS.tab"
run:
......@@ -270,7 +299,7 @@ rule deseq2:
params: outdir = OUTPUTDIR+"/DESEQ2/results",
countsdir = OUTPUTDIR+"/DESEQ2/counts"
shell: """
cat {SCRIPTPATH}/run_deseq2.R | R --slave --args {input.conditions} {params.countsdir} {params.outdir} {BIOMART}
cat {SCRIPTPATH}/run_deseq2.R | R --slave --args {input.conditions} {params.countsdir} {params.outdir} {BIOMART}
"""
rule deg:
......@@ -281,11 +310,18 @@ rule deg:
shell: """
cat {SCRIPTPATH}/deg.R | R --slave --args {input.rdata} {params.cond1} {params.cond2}
"""
rule downloadAnnotRef:
output: temp(touch(OUTPUTDIR+"/DESEQ2/annotDbInstalled.tmp"))
shell: """
cat {SCRIPTPATH}/installAnnotDatabase.R | R --slave --args {REFNAME} {SCRIPTPATH}/corresIDorg.txt
"""
rule annot:
input: expand(OUTPUTDIR+"/DESEQ2/results/{{comp}}/{{comp}}_{file}", file=["All_DEG.tsv","DEseqRes.tsv"]),
corrAnnotations=(SCRIPTPATH+"/corresIDorg.txt"),
rdata=OUTPUTDIR+"/DESEQ2/rData/dds.RData"
rdata=OUTPUTDIR+"/DESEQ2/rData/dds.RData",
installedAnnot=OUTPUTDIR+"/DESEQ2/annotDbInstalled.tmp"
output: expand(OUTPUTDIR+"/DESEQ2/results/{{comp}}/{{comp}}_{file}",file=["gseGo.txt","gseKegg.txt"])
params: cond1=lambda wildcards : getConditionForComp(wildcards)["cond1"],
cond2=lambda wildcards : getConditionForComp(wildcards)["cond2"]
......
## Calcul à partir de la matrice FPKM : https://www.gtexportal.org/home/faq#tpm
#tpm=RPKM / sum(RPKM) * 1e6
fpkm_to_tpm <- function(FPKM) {
TPM=FPKM/sum(FPKM)*1e6
return(TPM)
}
args=commandArgs(trailingOnly=TRUE)
fpkmFile<-args[1]
tpmFile<-args[2]
fpkm=read.table(fpkmFile, header=T, check.names=FALSE)
tpm=apply(fpkm[,2:ncol(fpkm)], 2, fpkm_to_tpm)
tpm=as.data.frame(tpm)
tpm=cbind(as.character(fpkm[,1]), tpm)
names(tpm) = names(fpkm)
write.table(tpm, tpmFile, col.names=TRUE, sep="\t", row.names = FALSE, quote=FALSE)
\ No newline at end of file
#!/usr/bin/env RScript
args=commandArgs(trailingOnly=TRUE)
org=args[1]
corresFile=args[2]
corresIDorg=read.table(corresFile,header=T,row.names=1)
orgGO=as.character(corresIDorg[org,1])
if((!require(orgGO,character.only=TRUE))){
requireNamespace("BiocManager", quietly = TRUE)
BiocManager::install(orgGO, updates = TRUE)
}
......@@ -6,6 +6,7 @@ import csv
import re
import sys
import urllib.request
import errno
from collections import OrderedDict
## Main
......@@ -13,9 +14,9 @@ parser = argparse.ArgumentParser()
parser.add_argument('-s', '--samplesheet', metavar='FILE', help='Tab delimited file with no header describing samples. Columns must be: "name condition". Only characters "A-Z","0-9","-" and "_" allowed. Both columns are mandatory. (REQUIRED)', type=argparse.FileType('rt'), required=True, dest='samplesheet')
parser.add_argument('-w', '--workdir', metavar='DIR', help='Analysis working directory. Default: current directory', default='Results', dest='workdir')
parser.add_argument('-r', '--reference-name', metavar='DIR', help='Reference name. This name must match a key in the CONFIG/references.json file. If not used, you will have to write the reference object yourself in the config.json file', dest='referencename', required=False)
parser.add_argument('-r', '--reference-name', metavar='NAME', help='Reference name. This name must match a key in the CONFIG/references.json file. If not used, you will have to write the reference object yourself in the config.json file', dest='referencename', required=False)
parser.add_argument('-c', '--comparisons', metavar='FILE', help='Tab delimited file with no headers indicating which conditions to compare during differential expression analysis. Columns must be "condition1 condition2". (REQUIRED)', type=argparse.FileType('rt'), required=True, dest='conditions')
parser.add_argument('-t', '--librarytype', choices=['yes','no','reverse'] ,help='Library type. If you have no idea what this is, please see https://chipster.csc.fi/manual/library-type-summary.html',required=True, dest='librarytype')
parser.add_argument('-t', '--librarytype', choices=['yes','no','reverse'] ,help='Library type. If you have no idea what this is, please see "https://chipster.csc.fi/manual/library-type-summary.html"',required=True, dest='librarytype')
parser.add_argument('-l', '--readlength', metavar='N', type=int, help='Length of the reads.', default=100, required=False, dest='readlength')
parser.add_argument('-n', '--projectname', help='Project name which will appear in html report.', default="RNAseq project", required=False, dest='projectname')
parser.add_argument('-p', '--project', metavar='FILE', help='project.json file generated by illuminadir.jar. (REQUIRED)', required=True, dest='projectjson')
......@@ -57,8 +58,8 @@ else:
d["reference"]["gtf"]="</path/to/gtffile.gtf>"
d["reference"]["biomart"]="feb2014.archive.ensembl.org,ENSEMBL_MART_ENSEMBL,hsapiens_gene_ensembl #for example"
alphanum = re.compile('^[\w-]+$')
alphanumStartLetter = re.compile('^[a-zA-Z]+[\w-]+$')
alphanum = re.compile(r'^[\w-]+$')
alphanumStartLetter = re.compile(r'^[a-zA-Z]+[\w-]+$')
# Check project name
if(not alphanum.match(d["project-name"])):
......
......@@ -104,13 +104,6 @@ directory <- c(COUNTDIRECTORY)
conditions<-factor(sampleTable$condition)
uniq_conds <- unique(conditions)
#pw_tests <- list()
#for(i in 1:(length(uniq_conds)-1)) {
# for(j in (i+1):length(uniq_conds)) {
# pw_tests[[length(pw_tests)+1]] <- c(as.character(uniq_conds[j]), as.character(uniq_conds[i]))
# }
#}
##Récupération de la matrice normalisée
ddsHTSeq <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable, directory = directory, design= ~condition)
......@@ -154,7 +147,6 @@ geneNames <-getBM(filters="ensembl_gene_id",
matrix <- merge(matrix,geneNames,by.x="row.names",by.y="ensembl_gene_id",all.x=TRUE)
mrld<-merge(mrld,geneNames,by.x="row.names",by.y="ensembl_gene_id",all.x=TRUE)
matrix$Row.names <- paste(matrix$Row.names, matrix$external_gene_name, sep = "|")
mrld$Row.names <- paste(mrld$Row.names, mrld$external_gene_name, sep = "|")
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment