Commit 1ab964e9 authored by Eric CHARPENTIER's avatar Eric CHARPENTIER 🐍
Browse files

refactored conda envs to create only one

parent ad4c1c77
#!/usr/bin/env Rscript
# Install the R packages used by the differential-expression scripts through
# the legacy Bioconductor installer (biocLite).
#
# NOTE(review): biocLite is the pre-BiocManager installer; recent Bioconductor
# releases expect BiocManager::install() instead -- confirm which R version
# this script is meant to run under.

# The sourced file is executed as R code, so fetch it over HTTPS rather than
# plain HTTP to avoid running a script that was altered in transit.
source("https://bioconductor.org/biocLite.R")

# A single vectorised call lets biocLite resolve dependencies and run its
# update check once, instead of once per package.
biocLite(c(
  "DESeq2",
  "limma",
  "ggplot2",
  "ComplexHeatmap",
  "pvclust",
  "fdrtool",
  "ggrepel",
  "GSEABase",
  "GO.db",
  "clusterProfiler",
  "pathview",
  "ReactomePA",
  "fgsea",
  "graphite",
  "biomaRt",
  "genefilter",
  "RColorBrewer",
  "gplots"
))
\ No newline at end of file
# Install every R package the pipeline needs that is not already present in
# the active library, using BiocManager for both Bioconductor and CRAN
# packages.
packageList <- c(
  "circlize",
  "clusterProfiler",
  "ComplexHeatmap",
  "DESeq2",
  "DOSE",
  "dplyr",
  "fdrtool",
  "fgsea",
  "ggplot2",
  "ggrepel",
  "GO.db",
  "grid",
  "GSEABase",
  "limma",
  "pvclust",
  "biomaRt",
  "gplots"
)

# Only request the packages that are missing from the current library.
packagesToInstall <- setdiff(packageList, installed.packages()[, "Package"])

# Bootstrap BiocManager itself from CRAN when it is not available yet.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager", repos = "https://cloud.r-project.org")
}

# The argument is spelled `update`; the previous `updates = TRUE` was not
# matched by BiocManager::install() and was passed on through `...` instead.
# `ask = FALSE` is required because this script runs non-interactively and
# cannot answer the update prompt.
BiocManager::install(packagesToInstall, update = TRUE, ask = FALSE)
name: rna
channels:
- ostrokach
- bioconda
- conda-forge
- ostrokach
- defaults
dependencies:
- bcftools=1.9
- cutadapt=1.18=py36_0
- fastqc=0.11.8=0
- htseq=0.9.1=py36h24bf2e0_1
- htslib=1.9=hc238db4_4
- libdeflate=1.0=h470a237_0
- multiqc=1.6=py36h24bf2e0_0
- perl-threaded=5.22.0=13
- prinseq=0.20.4=2
- pysam=0.15.1=py36h0380709_0
- samtools=1.9=h8ee4bcc_1
- snakemake=5.3.0=py36_1
- snakemake-minimal=5.3.0=py36_0
- star=2.6.1b=0
- xopen=0.3.2=py_1
- aioeasywebdav=2.2.0=py36_0
- cachetools=2.1.0=py_0
- colormath=3.0.0=py_2
- filechunkio=1.8=py_2
- ftputil=3.4=py_0
- google-auth=1.5.1=py_0
- google-auth-httplib2=0.0.3=py_2
- google-cloud-core=0.24.1=py36_0
- google-cloud-storage=1.1.1=py36_0
- google-resumable-media=0.0.2=py36_0
- googleapis-common-protos=1.5.5=py_0
- httplib2=0.11.3=py36_1001
- lzstring=1.0.3=py36_0
- prettytable=0.7.2=py_2
- pysftp=0.2.9=py_1
- python-irodsclient=0.7.0=py_0
- ratelimiter=1.2.0=py36_1000
- rsa=3.4.2=py_1
- spectra=0.0.11=py_0
- xmlrunner=1.7.7=py_0
- aiohttp=3.4.4=py36h7b6447c_0
- appdirs=1.4.3=py36h28b3542_0
- asn1crypto=0.24.0=py36_0
- async-timeout=3.0.1=py36_0
- attrs=18.2.0=py36h28b3542_0
- bcrypt=3.1.4=py36h14c3975_0
- blas=1.0=mkl
- boto3=1.9.35=py36_0
- botocore=1.12.35=py36_0
- bz2file=0.98=py36_1
- bzip2=1.0.6=h14c3975_5
- ca-certificates=2018.03.07=0
- cairo=1.14.12=h8948797_3
- certifi=2018.10.15=py36_0
- cffi=1.11.5=py36he75722e_1
- chardet=3.0.4=py36_1
- click=7.0=py36_0
- configargparse=0.13.0=py36_0
- cryptography=2.3.1=py36hc365091_0
- curl=7.61.0=h84994c4_0
- cycler=0.10.0=py36_0
- datrie=0.7.1=py36h7b6447c_1
- dbus=1.13.2=h714fa37_1
- decorator=4.3.0=py36_0
- docutils=0.14=py36_0
- dropbox=9.1.0=py36_0
- expat=2.2.6=he6710b0_0
- fontconfig=2.13.0=h9420a91_0
- freetype=2.9.1=h8a8886c_1
- fribidi=1.0.5=h7b6447c_0
- future=0.17.1=py36_0
- glib=2.56.2=hd408876_0
- graphite2=1.3.12=h23475e2_2
- graphviz=2.40.1=h21bd128_2
- gst-plugins-base=1.14.0=hbbd80ab_1
- gstreamer=1.14.0=hb453b48_1
- harfbuzz=1.8.8=hffaf4a1_0
- icu=58.2=h9c2bf20_1
- idna=2.7=py36_0
- idna_ssl=1.1.0=py36_0
- intel-openmp=2019.0=118
- jinja2=2.10=py36_0
- jmespath=0.9.3=py36_0
- jpeg=9b=h024ee3a_2
- jsonschema=2.6.0=py36_0
- kiwisolver=1.0.1=py36hf484d3e_0
- libcurl=7.61.0=h1ad7b7a_0
- libedit=3.1.20170329=h6b74fdf_2
- libffi=3.2.1=hd88cf55_4
- libgcc=7.2.0=h69d50b8_2
- libgcc-ng=8.2.0=hdf63c60_1
- libgfortran-ng=7.3.0=hdf63c60_0
- libpng=1.6.35=hbc83047_0
- libprotobuf=3.6.1=hd408876_0
- libsodium=1.0.16=h1bed415_0
- libssh2=1.8.0=h9cfc8f7_4
- libstdcxx-ng=8.2.0=hdf63c60_1
- libtiff=4.0.9=he85c1e1_2
- libuuid=1.0.3=h1bed415_2
- libxcb=1.13=h1bed415_1
- libxml2=2.9.8=h26e45fe_1
- markdown=3.0.1=py36_0
- markupsafe=1.0=py36h14c3975_1
- matplotlib=3.0.1=py36h5429711_0
- mkl
- mkl_fft=1.0.6=py36h7dd41cf_0
- mkl_random=1.0.1=py36h4414c95_1
- multidict=4.4.2=py36h7b6447c_0
- ncurses=6.1=hf484d3e_0
- networkx=2.0=py36h7e96fb8_0
- numpy=1.15.4=py36h1d66e8a_0
- numpy-base=1.15.4=py36h81de0dd_0
- openjdk=8.0.152=h46b5887_1
- openssl=1.0.2p=h14c3975_0
- pandas=0.23.4=py36h04863e7_0
- pango=1.42.4=h049681c_0
- paramiko=2.4.2=py36_0
- pcre=8.42=h439df22_0
- perl=5.26.2=h14c3975_0
- pigz=2.4=h84994c4_0
- pip=18.1=py36_0
- pixman=0.34.0=hceecf20_3
- protobuf=3.6.1=py36he6710b0_0
- psutil=5.4.8=py36h7b6447c_0
- pyasn1=0.4.4=py36h28b3542_0
- pyasn1-modules=0.2.2=py36_0
- pycparser=2.19=py36_0
- pygraphviz=1.3=py36h14c3975_1
- pynacl=1.3.0=py36h7b6447c_0
- pyopenssl=18.0.0=py36_0
- pyparsing=2.3.0=py36_0
- pyqt=5.9.2=py36h05f1152_2
- pysocks=1.6.8=py36_0
- python=3.6.6=h6e4f718_2
- python-dateutil=2.7.5=py36_0
- pytz=2018.7=py36_0
- pyyaml=3.13=py36h14c3975_0
- qt=5.9.6=h8703b6f_2
- readline=7.0=h7b6447c_5
- requests=2.20.0=py36_0
- s3transfer=0.1.13=py36_0
- setuptools=40.5.0=py36_0
- simplejson=3.16.0=py36h14c3975_0
- sip=4.19.8=py36hf484d3e_0
- six=1.11.0=py36_1
- sqlite=3.25.2=h7b6447c_0
- tk=8.6.8=hbc83047_0
- tornado=5.1.1=py36h7b6447c_0
- urllib3=1.23=py36_0
- wheel=0.32.2=py36_0
- wrapt=1.10.11=py36h14c3975_2
- xz=5.2.4=h14c3975_4
- yaml=0.1.7=had09818_2
- yarl=1.2.6=py36h14c3975_0
- zlib=1.2.11=ha838bed_2
- gzip=1.7=1
- cutadapt=1.18
- fastqc=0.11.8
- htseq=0.9.1
- multiqc=1.6
- numpy=1.14
- pandas=0.23.4
- prinseq=0.20.4
- pysam=0.15.1
- samtools=1.9
- snakemake-minimal>=5.2
- star>=2.6.1b
- openjdk=8.0
- simplejson
- urllib3
- gzip
- wget
- curl
- r-base=3.6
- r-xml
\ No newline at end of file
![](https://img.shields.io/badge/snakemake-limited-orange.svg)
RNAseq quantification :snake:
RNAseq quantification pipeline:snake:
==============================================
This project is an analysis pipeline using **Snakemake** for RNAseq analysis in order to find differentially expressed genes.
It has been widely tested on human RNA sequencing from an Illumina HiSeq but should work on most systems and many other species, provided you manually download the necessary resource files.
......@@ -12,6 +11,7 @@ It has been widely tested on human RNA sequencing from an Illumina HiSeq but sho
**This pipeline is set for paired-end data only from Illumina HiSeq output files.**
The main steps of the pipeline are:
- cleaning data with [prinseq](http://prinseq.sourceforge.net/) and [cutadapt](http://cutadapt.readthedocs.io/en/stable/guide.html)
- alignment of reads on reference genome with [STAR](https://github.com/alexdobin/STAR)
- counting features with [HTSeq](http://htseq.readthedocs.io/)
......@@ -24,35 +24,6 @@ The only requirement is to have a working install of [conda](https://www.anacond
All tools necessary to run the pipeline are described in two conda environment files.
The species-specific resource files have to be downloaded manually if the species is not human.
Troubleshooting
---------------
Since conda 4.6, you may have some problems due to the way we source rnaDE environment in Snakefile.
If you get the following error:
```
$HOME/miniconda3/etc/profile.d/conda.sh: line 26: PS1: unbound variable
```
You can apply the solution described at https://github.com/conda/conda/issues/8186 on function `__conda_activate()` and `__conda_reactivate()` in `$HOME/miniconda3/etc/profile.d/conda.sh`.
You can also have this problem with `binutils_linux`, `gcc_linux`, `gfortran_linux`, `gxx_linux`:
```
$HOME/miniconda3/envs/dgeDE/etc/conda/activate.d/activate-binutils_linux-64.sh: line 67: ADDR2LINE : unbound variable
$HOME/miniconda3/envs/dgeDE/etc/conda/activate.d/activate-gcc_linux-64.sh: line 109: SYS_SYSROOT : unbound variable
$HOME/miniconda3/envs/dgeDE/etc/conda/activate.d/activate-gcc_linux-64.sh: line 67: CPPFLAGS : unbound variable
$HOME/miniconda3/envs/dgeDE/etc/conda/activate.d/activate-gfortran_linux-64.sh: line 67: GFORTRAN : unbound variable
$HOME/miniconda3/envs/dgeDE/etc/conda/activate.d/activate-gxx_linux-64.sh: line 67: GFORTRAN : unbound variable
```
You can fix it by adapting the previous trick on the problematic lines (e.g. line 67 `eval oldval="\$${from}$thing"` becomes `eval oldval="\${${from}$thing:-}"` and so on)
Note that if the following error with `CMAKE_PREFIX_PATH_USED` appears in `activate-gcc_linux-64.sh`, the corresponding line can be removed.
```
$HOME/miniconda3/envs/rnaDE/etc/conda/activate.d/activate-gcc_linux-64.sh: line 141: CMAKE_PREFIX_PATH_USED : unbound variable
```
Also note that editing those files directly will impact all environments using the same version of the package (as conda uses hard links when installing an env). Please move the original file to a backup and make your modifications on a copy.
### Cloning the repository
```bash
......@@ -81,13 +52,13 @@ source activate rna
Two json configuration files are necessary to run the pipeline:
* `project.json`: contains the description of your samples (name, path of the fastq files etc.). This file is generated by [illuminadir.jar](http://lindenb.github.io/jvarkit/IlluminaDirectory.html) :
- `project.json`: contains the description of your samples (name, path of the fastq files etc.). This file is generated by [illuminadir.jar](http://lindenb.github.io/jvarkit/IlluminaDirectory.html) :
```bash
find /path/to/fastq/files -type f -name "*.fastq.gz" | java -jar scripts/illuminadir.jar -J | python -m json.tool > project.json
```
* `config.json`: contains the parameters of the analysis:
- `config.json`: contains the parameters of the analysis:
| | |
| :--- | :--- |
......@@ -113,7 +84,7 @@ python scripts/make_rna_config.py -h
Two tab separated files are necessary to use this script.
* samplesheet.tsv : tab separated file containing the name of the samples and their condition:
- samplesheet.tsv : tab separated file containing the name of the samples and their condition:
| | |
| :--- | :--- |
......@@ -123,7 +94,7 @@ Two tab separated files are necessary to use this script.
| ... | ... |
| | |
* comparisons.tsv : tab separated file containing the pairs of conditions to compare:
- comparisons.tsv : tab separated file containing the pairs of conditions to compare:
| | |
| :--- | :--- |
......
......@@ -242,12 +242,12 @@ rule htseq:
shell: """
if [ {params.library_type} = "reverse" ]
then
htseq-count -s reverse -f bam {input} {GTF} > {output}
htseq-count -r pos -s reverse -f bam {input} {GTF} > {output}
elif [ {params.library_type} = "yes" ]
then
htseq-count -s yes -f bam {input} {GTF} > {output}
htseq-count -r pos -s yes -f bam {input} {GTF} > {output}
else
htseq-count -s no -f bam {input} {GTF} > {output}
htseq-count -r pos -s no -f bam {input} {GTF} > {output}
fi
"""
......@@ -268,7 +268,6 @@ rule deseq2:
params: outdir = OUTPUTDIR+"/DESEQ2/results",
countsdir = OUTPUTDIR+"/DESEQ2/counts"
shell: """
source activate rnaDE
cat {SCRIPTPATH}/run_deseq2.R | R --slave --args {input.conditions} {params.countsdir} {params.outdir} {BIOMART}
"""
......@@ -278,7 +277,6 @@ rule deg:
params: cond1=lambda wildcards : getConditionForComp(wildcards)["cond1"],
cond2=lambda wildcards : getConditionForComp(wildcards)["cond2"]
shell: """
source activate rnaDE
cat {SCRIPTPATH}/deg.R | R --slave --args {input.rdata} {params.cond1} {params.cond2}
"""
......@@ -290,7 +288,6 @@ rule annot:
params: cond1=lambda wildcards : getConditionForComp(wildcards)["cond1"],
cond2=lambda wildcards : getConditionForComp(wildcards)["cond2"]
shell: """
source activate rnaDE
cat {SCRIPTPATH}/annot.R | R --slave --args {input.rdata} {params.cond1} {params.cond2} {REFNAME} {input.corrAnnotations}
"""
......
......@@ -6,7 +6,6 @@ set -e
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
MAIN_ENV_NAME=rna
DE_ENV_NAME=rnaDE
ENVS=$(conda env list | awk '{print $1}' )
......@@ -25,24 +24,6 @@ if [ ${FOUND} -eq 0 ]; then
else
echo "Creating env ${MAIN_ENV_NAME}"
conda env create -n ${MAIN_ENV_NAME} -f ${DIR}/CONDA/rna.yml
fi
FOUND=1
for ENV in ${ENVS}
do
if [ "${ENV}" == "${DE_ENV_NAME}" ]; then
FOUND=0
fi
done
# Creation of de conda environment.
if [ ${FOUND} -eq 0 ]; then
echo "${DE_ENV_NAME} already created"
else
echo "Creating env ${DE_ENV_NAME}"
conda env create -n ${DE_ENV_NAME} -f ${DIR}/CONDA/rnaDE.yml
source activate ${DE_ENV_NAME}
source activate ${MAIN_ENV_NAME}
Rscript ${DIR}/CONDA/installDeEnv.R
source deactivate
fi
fi
\ No newline at end of file
......@@ -8,7 +8,6 @@ CORRANNOT<-args[5]
library(GO.db)
library(GSEABase)
library(clusterProfiler)
library(pathview)
library(DOSE)
library(fgsea)
......@@ -45,8 +44,16 @@ corresIDorg=read.table(CORRANNOT,header=T,row.names=1)
orgGO=as.character(corresIDorg[ASSEMBLY,1])
orgKegg=as.character(corresIDorg[ASSEMBLY,2])
# Make sure the organism annotation package named by `orgGO` is available,
# installing it through BiocManager the first time it is needed.
if (!requireNamespace(orgGO, quietly = TRUE)) {
  # Fail with an explicit message instead of ignoring the check's result and
  # erroring later inside `BiocManager::install()`.
  if (!requireNamespace("BiocManager", quietly = TRUE)) {
    stop(
      "Package 'BiocManager' is required to install ", orgGO,
      call. = FALSE
    )
  }
  # The argument is spelled `update` (not `updates`); `ask = FALSE` because
  # the script is run non-interactively and cannot answer a prompt.
  BiocManager::install(orgGO, update = TRUE, ask = FALSE)
}
# Attach the package in both cases: previously it was attached only when it
# was already installed, never right after a fresh installation. library()
# also stops with an error if the installation did not succeed.
library(orgGO, character.only = TRUE)
data=read.csv(paste(paste(OUTDIR,comp,comp,sep="/"),"All_DEG.tsv",sep="_"), header=TRUE, sep="\t")
#Sys.setenv(http_proxy="http://cache.ha.univ-nantes.fr:3128")
#Sys.setenv(https_proxy="https://cache.ha.univ-nantes.fr:3128")
if(dim(data)[1]>10){
geneList=makeGeneList(data,orgGO)
gene=names(geneList)
......@@ -57,7 +64,7 @@ if(dim(data)[1]>10){
write.table(ego,paste(paste(OUTDIR,comp,comp,sep="/"),"annotGo.tsv",sep="_"),sep="\t",row.names=F,quote=F)
ekegg=enrichKEGG(gene, organism = as.character(orgKegg), keyType = "kegg", pvalueCutoff = 0.05,pAdjustMethod = "BH", minGSSize = 10, maxGSSize = 500,qvalueCutoff = 0.2, use_internal_data = FALSE)
eKeggSymbol=setReadable(ekegg,orgGO,keytype="ENTREZID")
eKeggSymbol=setReadable(ekegg,orgGO,keyType="ENTREZID")
png(paste(paste(OUTDIR,comp,comp,sep="/"),"dotplotKEGG.png",sep="_"),width=1000,height=600)
print(clusterProfiler::dotplot(eKeggSymbol))
dev.off()
......@@ -68,10 +75,10 @@ dataTot=read.csv(paste(paste(OUTDIR,comp,comp,sep="/"),"DEseqRes.tsv",sep="_"),
geneListTot=makeGeneList(dataTot,orgGO)
ego2=gseGO(geneList=geneListTot,OrgDb=orgGO,ont="ALL",nPerm=1000,minGSSize=20,maxGSSize=500,pvalueCutoff=1,verbose=FALSE)
ego2Symbol=setReadable(ego2,orgGO,keytype="ENTREZID")
ego2Symbol=setReadable(ego2,orgGO,keyType="ENTREZID")
kk2=gseKEGG(geneList=geneListTot,organism=as.character(orgKegg),nPerm=1000,minGSSize=2,pvalueCutoff=1,verbose=FALSE)
kk2Symbol=setReadable(kk2,orgGO,keytype="ENTREZID")
kk2Symbol=setReadable(kk2,orgGO,keyType="ENTREZID")
write.table(ego2Symbol,paste(paste(OUTDIR,comp,comp,sep="/"),"gseGo.txt",sep="_"),sep="\t",row.names=F,quote=F)
write.table(kk2Symbol,paste(paste(OUTDIR,comp,comp,sep="/"),"gseKegg.txt",sep="_"),sep="\t",row.names=F,quote=F)
\ No newline at end of file
write.table(kk2Symbol,paste(paste(OUTDIR,comp,comp,sep="/"),"gseKegg.txt",sep="_"),sep="\t",row.names=F,quote=F)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment