Commit 326a0758 authored by Eric CHARPENTIER's avatar Eric CHARPENTIER 🐍
Browse files

added script to find bad samples in a run

parent caa47e70
#!/usr/bin/env python
from __future__ import print_function
import os
import json
import argparse
import sys
# Prints to stderr
def eprint(*args, **kwargs):
print(*args, file=sys.stderr, **kwargs)
## Main
parser = argparse.ArgumentParser()
parser.add_argument('-r', '--rundir', metavar='DIR', help='Analysis working directory.', required=True, dest='rundir')
parser.add_argument('--minGenes', metavar='N', type=int, help='Minimum genes detected necessary for a sample to pass the filtering step in secondary analysis.', required=False, dest='minGenes')
parser.add_argument('--minReads', metavar='N', type=int, help='Minimum reads assigned necessary for a sample to pass the filtering step in secondary analysis.', required=False, dest='minReads')
args = parser.parse_args()
rundir = args.rundir
if not os.path.exists(rundir):
eprint("ERROR getBadSamplesForRun.py: The run directory "+rundir+" does not exist !")
sys.exit()
configFile = os.path.join(rundir, "config_used_in_analysis.json")
if not os.path.isfile(configFile):
eprint("ERROR getBadSamplesForRun.py: Cannot find the configuration file \""+configFile+"\" !")
sys.exit()
config = json.load(open(configFile, "r"))
projects = {s["project"] for s in config["samples"]}
sample2Project = {s["name"]:s["project"] for s in config["samples"]}
print(sample2Project)
mins = dict()
for p in projects:
mins[p] = dict()
if (args.minGenes):
eprint("INFO getBadSamplesForRun.py: Minimum genes specified. Using \""+str(args.minGenes)+"\" as minimum value !")
for p in projects:
mins[p]["minGenes"] = args.minGenes
else:
eprint("INFO getBadSamplesForRun.py: Minimum genes not specified.")
for p in projects:
if(not p in config["comparisons"]):
eprint("INFO getBadSamplesForRun.py: Using default 5000 genes for project \""+p+"\".")
mins[p]["minGenes"] = 5000
else:
eprint("INFO getBadSamplesForRun.py: Using "+str(config["comparisons"][p]["minGenes"])+" genes for project \""+p+"\".")
mins[p]["minGenes"] = config["comparisons"][p]["minGenes"]
if (args.minReads):
eprint("INFO getBadSamplesForRun.py: Minimum UMI specified. Using \""+str(args.minReads)+"\" as minimum value !")
for p in projects:
mins[p]["minReads"] = args.minReads
else:
eprint("INFO getBadSamplesForRun.py: Minimum UMI not specified.")
for p in projects:
if(not p in config["comparisons"]):
eprint("INFO getBadSamplesForRun.py: Using default 200k UMI for project \""+p+"\".")
mins[p]["minReads"] = 200000
else:
eprint("INFO getBadSamplesForRun.py: Using "+str(config["comparisons"][p]["minReads"])+" UMI for project \""+p+"\".")
mins[p]["minReads"] = config["comparisons"][p]["minReads"]
runUnqWellSummary = os.path.join(rundir, "EXPRESSION","run.unq.well_summary.dat")
if not os.path.isfile(runUnqWellSummary):
eprint("ERROR getBadSamplesForRun.py: Cannot find the run.unq.well_summary.dat file \""+runUnqWellSummary+"\" !")
sys.exit()
with open(runUnqWellSummary, 'r') as f:
samples = f.readline().strip("\n").split("\t")
for line in f.readlines():
if (line.startswith("Refseq_UMI")): refseq = line.strip("\n").split("\t")
if (line.startswith("Genes_Detected")): genes = line.strip("\n").split("\t")
badSamples = dict()
count = 0
for i in range(1,len(samples)):
count+=1
if (int(refseq[i])<mins[sample2Project[samples[i]]]["minReads"] or int(genes[i])<mins[sample2Project[samples[i]]]["minGenes"]):
badSamples[samples[i]] = {"refseq_umi":refseq[i],"genes_detected":genes[i]}
print(str(len(badSamples))+" samples found below the threshold")
print("Sample\tRefseq_UMI\tgenes_detected")
for s in badSamples:
print(s+"\t"+str(badSamples[s]["refseq_umi"])+"\t"+str(badSamples[s]["genes_detected"]))
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment