Commit bf74b238 authored by A.J. Sethi's avatar A.J. Sethi

whippet index is stable

parent 3865ed8a
......@@ -3,7 +3,7 @@
# written by A.J. Sethi on 2020-10-18
# testing;
# bash ~/ClaiRO/core-w/2020-10-18_makeIndex.sh -a "/g/data/lf10/as7425/genomes/human_genome" -w "/home/150/as7425/.julia/v0.6/Whippet/bin/" -n "/scratch/lf10/as7425/opt/nacsent" -c "/scratch/lf10/as7425/opt/mature" -o "/scratch/lf10/as7425/opt/shrek"
# module load clairo; bash ~/ClaiRO/core-w/2020-10-18_makeIndex.sh -a "/g/data/lf10/as7425/SRscan/genome_dmel-r6.35" -w "/home/150/as7425/.julia/v0.6/Whippet/bin/" -n "/scratch/lf10/as7425/nascent_bam" -c "/scratch/lf10/as7425/cytoplasmic_bam" -o "/scratch/lf10/as7425/clairo_out"
# submit with;
......@@ -19,7 +19,7 @@
####################################
# admin functions
# establish admin functions prior to proceeding
# die function
die() { printf "$(date +%F)\t$(date +%T)\t[scriptDied] characterise-ExoDUs.sh died because it $*\n"; exit 1; }; export -f die
......@@ -35,12 +35,12 @@ vsec() { if [ "${VERBOSE}" == "TRUE" ]; then printf "$(date +%F)\t$(date +%T)\t\
####################################
# housekeeping
# initialize variables prior to parsing arguements
nsec "ClaiRO initiated"
# test that all the requisite packages are avialable in path
# define a function that returns status 1 if a package isn't available
function checkAvailable {
builtin type -P "$1" &> /dev/null
......@@ -59,7 +59,10 @@ export nbamCount=0
export cbamCount=0
export MODE="die"
####################################
# process the input arguments
ARGS=""
while [ $# -gt 0 ]
do
......@@ -72,7 +75,7 @@ do
case $options in
a) # annotation
[ -d ${OPTARG} ] && export input="${OPTARG}" && inputFiles=`ls -d ${OPTARG}/*.*` || die "cannot fetch input files" # get a list of files in the primary input
[ -d ${OPTARG} ] && export input="${OPTARG}" && inputFiles=`ls -d ${OPTARG}/*.*` || die "cannot access files in annotation directory" # get a list of files in the primary input
gunzipCount=$(echo $inputFiles | grep -c -e ".gz"); if [ ${gunzipCount} -gt "0" ]; then ssec "unzipping ${gunzipCount} files in ${1}"; for i in ${1}/*; do gunzip ${i} || die "cannot unzip ${i} " & done; wait; fi # unzip any .gz files if they are present
fastaCount=$(echo $inputFiles | tr " " "\n" | grep -v ".fai" | grep -c -e ".fasta" -e ".fa" -e "fna"); [ ${fastaCount} -eq "1" ] || die "did not identify a single input fasta in ${1}" # check that a fasta of some sort is present
export myFasta="${input}/$(ls ${OPTARG} | tr " " "\n" | grep -v ".fai" | grep -e ".fasta" -e ".fa" -e "fna")"
......@@ -80,14 +83,14 @@ do
export myAnnotation="${input}/$(ls ${OPTARG} | grep -e ".gtf")"
;;
n) # nacent RNA alignments
[ -d ${OPTARG} ] && export nbamDir="${OPTARG}" && nbamList=`ls -d ${OPTARG}/*.*` || die "cannot fetch input files" # get a list of files in the primary input
n) # nascent RNA alignments
[ -d ${OPTARG} ] && export nbamDir="${OPTARG}" && nbamList=`ls -d ${OPTARG}/*.*` || die "cannot find any binary alignments in nascent bam directory" # get a list of files in the primary input
nbamCount=$(echo $nbamList | grep -c -e ".bam")
[ ${nbamCount} -gt "0" ] || die "no binary alignments found in ${nbamDir}"
;;
c) # cytoplasmic RNA alignments
[ -d ${OPTARG} ] && export cbamDir="${OPTARG}" && cbamList=`ls -d ${OPTARG}/*.*` || die "cannot fetch input files" # get a list of files in the primary input
[ -d ${OPTARG} ] && export cbamDir="${OPTARG}" && cbamList=`ls -d ${OPTARG}/*.*` || die "ccannot find any binary alignments in mature bam directory" # get a list of files in the primary input
cbamCount=$(echo $cbamList | grep -c -e ".bam")
[ ${cbamCount} -gt "0" ] || die "no binary alignments found in ${cbamDir}"
;;
......@@ -135,7 +138,9 @@ do
done
### validate the input arguments
####################################
# validate the input arguments
# validate the annotation
[ -f ${myAnnotation} ] && [ -f ${myFasta} ] || die "user supplied invalid annotation"
......@@ -174,21 +179,58 @@ vsec "completed housekeeping; parsing GTF"
# first, merge the mature bams
matureSamMerge="${od}/matureSamMerge"; mkdir -p ${matureSamMerge} 2>/dev/null
cd ${cbamDir} && samtools merge -@ ${threadCount} ${matureSamMerge}/merged.bam `ls *.bam` || die "cannot merge the cytoplasmic RNA alignments"
cd ${cbamDir} && samtools merge -f -@ ${threadCount} ${matureSamMerge}/merged.bam `ls *.bam` 2>/dev/null || die "cannot merge the cytoplasmic RNA alignments"
samtools sort -@ ${threadCount} ${matureSamMerge}/merged.bam > ${matureSamMerge}/sorted.merged.bam || die "cannot sort your bam"
samtools rmdup -S ${matureSamMerge}/sorted.merged.bam ${matureSamMerge}/unique.sorted.merged.bam || die "cannot remove duplicates"
samtools index -@ ${threadCount} ${matureSamMerge}/unique.sorted.merged.bam || die "cannot index your merged bam"
# next, generate on-the-fly whippet index
wptIdx="/g/data/lf10/as7425/SRscan/index-whippet"; mkdir ${wptIdx}
julia ${whippetPath}/whippet-index.jl --fasta ${myReferenceSequence} --bam ${matureSamMerge}/unique.sorted.merged.bam --gtf ${myAnnotation} --bam-min-reads 3 || die "canot make whippet index"
# generate OTF fastq for each cytoplasmic alignment;
ssec "fasta is ${myFasta}"
ssec "bam is "${matureSamMerge}/unique.sorted.merged.bam""
ssec "annotation is ${myAnnotation}"
# ask whippet to make an index while recognizing splice junctions from our bam
wptIdx="/g/data/lf10/as7425/SRscan/index-whippet"; mkdir ${wptIdx} 2>/dev/null; cd ${wptIdx} || die "cannot access whippet index directory"; rm -rf ${wptIdx}/*
julia ${whippetPath}/whippet-index.jl --fasta "${myFasta}" --bam "${matureSamMerge}/unique.sorted.merged.bam" --gtf "${myAnnotation}" --bam-min-reads 3 --index ${wptIdx} || die "canot make whippet index"
exit 0
####################################
# generate OTF fastq for each cytoplasmic alignment
# make fastq directory
export fqd="${od}/whippetFASTQ"; mkdir -p ${fqd} 2>/dev/null;
[ -d ${fqd}] || die "cannot access fqd"
# define a function, fetchFastq, to fetch the fastq for each alignment in the user-provided cytoplasmic alignment folder
function fetchFastq() {
bam=${1}
bamNameTemp=${bam##*/}
bamName=${bam%./} # establsh the filename
# sort the file by name so fastq reads are paired
samtools sort -@ 4 -n ${bam} > ${fqd}/${bamName}.bam
# use samtools to extract a fastq fron the sorted bam file
samtools fastq -@ 4 ${fqd}/${bamName}.bam > ${fqd}/${bamName}.bam && rm ${fqd}/${bamName}.bam || die "cannot generate fastq for ${bamName}"
}
# call function fetchFastq for each cytoplasmic alignment in our cyto folder;
cd ${cbamDir} && ls *bam | parallel -j "${threadCount}" fetchFastq {} || die "cannot write fastqs for each ..."
## compute whippet over each fastq
# generate a list of fastqs
targfqs=`ls -d ${fqd}/*.fastq`
# for each fastq, call whippet
# now, seperate the data by strand
cd /dev/shm/exonsByGene/ && ls | parallel -j "${threadCount}" strandSeparateExons {} || die "cannot split your exons by strand"
ssec "assigned strand to all your exons"
samtools fastq input.bam > output.fastq
exit 0
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment