...
 
Commits (2)
......@@ -3,7 +3,7 @@
# written by A.J. Sethi on 2020-10-18
# testing;
# module load clairo; bash ~/ClaiRO/core-w/2020-10-18_makeIndex.sh -a "/g/data/lf10/as7425/genomes/human_genome" -w "/home/150/as7425/.julia/v0.6/Whippet/bin/" -n "/g/data/lf10/as7425/2020-10-20_castilloGuzman_analysis/clairoInput/nascentAlignments" -c "/g/data/lf10/as7425/2020-10-20_castilloGuzman_analysis/clairoInput/cytoplasmicAlignments" -o "/g/data/lf10/as7425/2020-10-20_castilloGuzman_analysis/clairoOutput" -t "48" -v "TRUE"
# module load clairo; bash ~/ClaiRO/core-w/2020-10-18_makeIndex.sh -a "/g/data/lf10/as7425/genomes/human_genome" -w "/home/150/as7425/.julia/v0.6/Whippet/bin/" -n "/g/data/lf10/as7425/2020-10-20_castilloGuzman_analysis/clairoInput/nascentAlignments" -c "/g/data/lf10/as7425/2020-10-20_castilloGuzman_analysis/clairoInput/cytoplasmicAlignments" -o "/g/data/lf10/as7425/2020-10-20_castilloGuzman_analysis/clairoOutput" -j "/g/data/lf10/as7425/apps/jvarkit/dist" -t "48" -v "TRUE"
# submit with;
......@@ -12,6 +12,7 @@
# -n /path/to/folder/with/nascentRNA/bams
# -c /path/to/folder/with/cytoplamic/or/totalRNA/bam
# -w /path/to/whippet/installation
# -j /path/to/jvarkit/dist/splitbam3.jar
# -o /path/to/output/output/directory
# -t /preferred/threadcount
# -m /preferred/runmode # to be implemented
......@@ -69,7 +70,7 @@ do
unset OPTIND
unset OPTARG
while getopts a:n:c:o:t:v:m:w: options
while getopts a:n:c:o:t:v:m:w:j: options
do
case $options in
......@@ -130,6 +131,10 @@ do
export whippetPath="${OPTARG}"
;;
j) # path to jvarkit splitbam3
export splitbamPath="${OPTARG}"
;;
esac;
done
shift $((OPTIND-1))
......@@ -163,10 +168,18 @@ export od=${outputDirectory}
# validate threadCount
ssec "proceeding with ${threadCount} threads"
# also define the quarter threadcount for downstream processing
export quarter_tc=$((${threadCount}/4))
export qtc=$(echo $quarter_tc | awk '{print int($1+0.5)}')
# validate whippet path
[ -z ${whippetPath+x} ] && die "user did not supply whippet path"
[ -f ${whippetPath}/whippet-index.jl ] || die "cannot find whippet in supplied path ${whippetPath}"
# validate jvarkit path
[ -z ${splitbamPath+x} ] && die "user did not supply jvarkit path"
[ -f ${splitbamPath}/splitbam3.jar ] || die "cannot find splitbam3 supplied path ${whippetPath}"
# validate the runmode
#[ ${MODE} == "die" ] && die "user did not define runmode"
ssec "your runmode is ${MODE}"
......@@ -189,12 +202,45 @@ else
cd ${cbamDir} && samtools merge -f -@ ${threadCount} ${matureSamMerge}/merged.bam `ls *.bam` 2>/dev/null || die "cannot merge the cytoplasmic RNA alignments";
fi
# sort the merged bams
# sort and select for unique alignments
vsec "preparing to sort your merged mature alignments"
if [ -f "${matureSamMerge}/sorted.merged.bam" ];
if [ -f "${matureSamMerge}/unique.sorted.merged.bam" ];
then vsec "previous merged alignment found -- skipping sort step";
else
samtools sort -@ ${threadCount} ${matureSamMerge}/merged.bam > ${matureSamMerge}/sorted.merged.bam || die "cannot sort your bam"
# prepare folders for splitting bam
splitMerged="${matureSamMerge}/splitMerged"
rm -rf ${splitMerged} 2>/dev/null
mkdir -p ${splitMerged}; [ -d ${splitMerged} ] || die "cannot access splitMerged"
# fetch chromosome names on the fly from the fasta
paste <(cat ${myAnnotation} | cut -f1 | grep -v '^#' | sort -u) <(cat ${myAnnotation} | cut -f1 | grep -v '^#' | sort -u) -d "\t" > ${splitMerged}/targets.txt
# sort and in parallel, split the bam
vsec "sorting and splitting the merged bam"
samtools sort -T ${matureSamMerge}/ -@ ${threadCount} ${matureSamMerge}/merged.bam | java -jar ${splitbamPath}/splitbam3.jar -o ${splitMerged}/__GROUPID__.bam -m --bamcompression 1 -g "${splitMerged}/targets.txt" || die "cannot split your sorted bam"
# define a function to remove duplicates from a gievn bam
removeBamDup() {
# test that the file is a bam
local myBam=$1
[ -f ${myBam} ] || die "invalid bam $1"
# remove duplicates
cd ${splitMerged} || die "1"
samtools rmdup -S ${myBam} - > ${myBam}.unique.bam || die "2"
rm ${myBam} 2>/dev/null || die "3"
}; export -f removeBamDup
# call the function for each bam
vsec "removing duplicates from each chromosome bam"
cd ${splitMerged} && ls *bam | parallel -j "${qtc}" removeBamDup {} || die "cannot remove duplicates"
# merge the final bams
vsec "merging your unique bams"
cd ${splitMerged} || die "cannot access splitMerged"
samtools merge -f -c -p -@ ${threadCount} ${matureSamMerge}/unique.sorted.merged.bam `ls *.unique.bam` 2>/dev/null || die "cannot merge unique bams"
vsec "indexing your merged bams"
samtools index -@ ${threadCount} ${matureSamMerge}/unique.sorted.merged.bam || die "cannot index the unique bam "
fi
# remove duplicates from sorted, merged bam
......@@ -209,6 +255,8 @@ else # proceed to split the bam
samtools rmdup -S ${matureSamMerge}/sorted.merged.bam ${matureSamMerge}/unique.sorted.merged.bam || die "cannot remove duplicates"
fi
nsec "done"
# index the unique bam
vsec "indexing sunique alignments"
......@@ -225,10 +273,10 @@ vsec "bam is "${matureSamMerge}/unique.sorted.merged.bam""
vsec "annotation is ${myAnnotation}"
# ask whippet to make an index while recognizing splice junctions from our bam
wptIdx="/g/data/lf10/as7425/SRscan/index-whippet"; mkdir ${wptIdx} 2>/dev/null; cd ${wptIdx} || die "cannot access whippet index directory"; rm -rf ${wptIdx}/*
wptIdx="/g/data/lf10/as7425/SRscan/index-whippet"; mkdir -p ${wptIdx} 2>/dev/null; cd ${wptIdx} || die "cannot access whippet index directory"; rm -rf ${wptIdx}/*
julia ${whippetPath}/whippet-index.jl --fasta "${myFasta}" --bam "${matureSamMerge}/unique.sorted.merged.bam" --gtf "${myAnnotation}" --bam-min-reads 3 --index ${wptIdx} || die "canot make whippet index"
exit 0
####################################
# generate OTF fastq for each cytoplasmic alignment
......@@ -259,7 +307,7 @@ targfqs=`ls -d ${fqd}/*.fastq`
# for each fastq, call whippet
# now, seperate the data by strand
cd /dev/shm/exonsByGene/ && ls | parallel -j "${threadCount}" strandSeparateExons {} || die "cannot split your exons by strand"
cd /dev/shm/exonsByGene/ && ls | parallel -j "${qtc}" strandSeparateExons {} || die "cannot split your exons by strand"
ssec "assigned strand to all your exons"
......@@ -278,7 +326,8 @@ awk -v OFS='\t' {'print $1,$2'} "${myFasta}.fai" | sort -V -k1,1 -k2,2 > ${genom
# get scaffolds from the annotation
cut -f1 ${myAnnotation} | grep "^[^#;]" | sort -u | sed '/^$/d' > ${genomeDir}/scaffolds.txt || die "cannot extract scaffolds"
scaffoldCount=$(cat ${genomeDir}/scaffolds.txt | wc -l); [ ${scaffoldCount} -gt "0" ] || die "did not find any scaffolds in your annotation"; ssec "found ${scaffoldCount} scaffolds in your annotation"
scaffoldCount=$(cat ${genomeDir}/scaffolds.txt | wc -l); [ ${scaffoldCount} -gt "0" ] || die "did not find any scaffolds in your annotation"
ssec "found ${scaffoldCount} scaffolds in your annotation"
cp ${myAnnotation} ${genomeDir} || die "cannot backup your annotation"
####################################################################################################
......