Commit 629a9768 authored by A.J. Sethi's avatar A.J. Sethi

clairo index is working until whippet

parent d4fa1da5
......@@ -86,13 +86,13 @@ do
n) # nascent RNA alignments
[ -d ${OPTARG} ] && export nbamDir="${OPTARG}" && nbamList=`ls -d ${OPTARG}/*.*` || die "cannot find any binary alignments in nascent bam directory" # get a list of files in the primary input
nbamCount=$(echo $nbamList | grep -c -e ".bam")
export nbamCount=$(echo $nbamList | grep -c -e ".bam")
[ ${nbamCount} -gt "0" ] || die "no binary alignments found in ${nbamDir}"
;;
c) # cytoplasmic RNA alignments
[ -d ${OPTARG} ] && export cbamDir="${OPTARG}" && cbamList=`ls -d ${OPTARG}/*.*` || die "ccannot find any binary alignments in mature bam directory" # get a list of files in the primary input
cbamCount=$(echo $cbamList | grep -c -e ".bam")
export cbamCount=$(echo $cbamList | grep -c -e ".bam")
[ ${cbamCount} -gt "0" ] || die "no binary alignments found in ${cbamDir}"
;;
......@@ -150,7 +150,7 @@ done
# validate the annotation
[ -f ${myAnnotation} ] && [ -f ${myFasta} ] || die "user supplied invalid annotation"
vsec "your annotation is ${myAnnotation}"
vsec "your reference sequence is ${myReferenceSequence}"
vsec "your reference sequence is ${myFasta}"
# validate the nascent binary alignments
[ "${nbamCount}" -gt "0" ] || die "did not detect any bam files in the nacent alignment directory"
......@@ -184,15 +184,16 @@ export qtc=$(echo $quarter_tc | awk '{print int($1+0.5)}')
#[ ${MODE} == "die" ] && die "user did not define runmode"
ssec "your runmode is ${MODE}"
vsec "completed housekeeping; parsing GTF"
####################################
# generate on the fly whippet index using mature bams
nsec "preparing cytoplasmic alignments for whippet indexing"
# tell bash to die if an undefined variable is used
set -u
# first, merge the mature bams
matureSamMerge="${od}/matureSamMerge"; mkdir -p ${matureSamMerge} 2>/dev/null
export matureSamMerge="${od}/matureSamMerge"; mkdir -p ${matureSamMerge} 2>/dev/null
# merge bams
vsec "preparing to merge mature alignments"
......@@ -203,12 +204,12 @@ cd ${cbamDir} && samtools merge -f -@ ${threadCount} ${matureSamMerge}/merged.ba
fi
# sort and select for unique alignments
vsec "preparing to sort your merged mature alignments"
vsec "generating sorted unique alignments for whippet index"
if [ -f "${matureSamMerge}/unique.sorted.merged.bam" ];
then vsec "previous merged alignment found -- skipping sort step";
then vsec "previous sorted unique alignment found -- skipping sort step";
else
# prepare folders for splitting bam
splitMerged="${matureSamMerge}/splitMerged"
export splitMerged="${matureSamMerge}/splitMerged"
rm -rf ${splitMerged} 2>/dev/null
mkdir -p ${splitMerged}; [ -d ${splitMerged} ] || die "cannot access splitMerged"
......@@ -217,7 +218,7 @@ else
# sort and in parallel, split the bam
vsec "sorting and splitting the merged bam"
samtools sort -T ${matureSamMerge}/ -@ ${threadCount} ${matureSamMerge}/merged.bam | java -jar ${splitbamPath}/splitbam3.jar -o ${splitMerged}/__GROUPID__.bam -m --bamcompression 1 -g "${splitMerged}/targets.txt" || die "cannot split your sorted bam"
samtools sort -l 0 -m 4G -T ${matureSamMerge}/ -@ ${threadCount} ${matureSamMerge}/merged.bam | java -jar ${splitbamPath}/splitbam3.jar -o ${splitMerged}/__GROUPID__.bam -m --bamcompression 1 -g "${splitMerged}/targets.txt" > /dev/null || die "cannot split your sorted bam"
# define a function to remove duplicates from a gievn bam
removeBamDup() {
......@@ -227,8 +228,9 @@ else
# remove duplicates
cd ${splitMerged} || die "1"
samtools rmdup -S ${myBam} - > ${myBam}.unique.bam || die "2"
samtools rmdup -S ${myBam} - > ${myBam%.bam}.unique.bam || die "2"
rm ${myBam} 2>/dev/null || die "3"
rm ${myBam}.bai 2>/dev/null || die "3"
}; export -f removeBamDup
# call the function for each bam
......@@ -243,28 +245,6 @@ else
samtools index -@ ${threadCount} ${matureSamMerge}/unique.sorted.merged.bam || die "cannot index the unique bam "
fi
# remove duplicates from sorted, merged bam
# samtools is poorly optimized
# so we first split the bam into chromosomes
# then remove duplicates
# and finally remerge everything
vsec "preparing to removed duplicates from sorted + merged alignments"
if [ -f "${matureSamMerge}/unique.sorted.merged.bam" ];
then vsec "previous unique alignment found -- skipping deduplication step";
else # proceed to split the bam
samtools rmdup -S ${matureSamMerge}/sorted.merged.bam ${matureSamMerge}/unique.sorted.merged.bam || die "cannot remove duplicates"
fi
nsec "done"
# index the unique bam
vsec "indexing sunique alignments"
if [ -f "${matureSamMerge}/unique.sorted.merged.bam.bai" ];
then vsec "previous index found -- skipping indexing step";
else samtools index -@ ${threadCount} ${matureSamMerge}/unique.sorted.merged.bam || die "cannot index your merged bam"
fi
### testing
### tell me the files we're using for the whippet indexing step
vsec "notifications incoming:"
......@@ -276,14 +256,14 @@ vsec "annotation is ${myAnnotation}"
wptIdx="/g/data/lf10/as7425/SRscan/index-whippet"; mkdir -p ${wptIdx} 2>/dev/null; cd ${wptIdx} || die "cannot access whippet index directory"; rm -rf ${wptIdx}/*
julia ${whippetPath}/whippet-index.jl --fasta "${myFasta}" --bam "${matureSamMerge}/unique.sorted.merged.bam" --gtf "${myAnnotation}" --bam-min-reads 3 --index ${wptIdx} || die "canot make whippet index"
####################################
# generate OTF fastq for each cytoplasmic alignment
nsec "generating sequences for each cytoplasmic alignment"
# make fastq directory
export fqd="${od}/whippetFASTQ"; mkdir -p ${fqd} 2>/dev/null;
[ -d ${fqd}] || die "cannot access fqd"
[ -d ${fqd} ] || die "cannot access fqd"
# define a function, fetchFastq, to fetch the fastq for each alignment in the user-provided cytoplasmic alignment folder
function fetchFastq() {
......@@ -293,13 +273,14 @@ function fetchFastq() {
bamName=${bam%./} # establsh the filename
# sort the file by name so fastq reads are paired
samtools sort -@ 4 -n ${bam} > ${fqd}/${bamName}.bam
samtools sort -l 0 -m 4G -@ 4 -n ${bam} > ${fqd}/${bamName}.bam
# use samtools to extract a fastq fron the sorted bam file
samtools fastq -@ 4 ${fqd}/${bamName}.bam > ${fqd}/${bamName}.bam && rm ${fqd}/${bamName}.bam || die "cannot generate fastq for ${bamName}"
samtools fastq -@ 4 ${fqd}/${bamName}.bam > ${fqd}/${bamName}.fastq && rm ${fqd}/${bamName}.bam || die "cannot generate fastq for ${bamName}"
}
# call function fetchFastq for each cytoplasmic alignment in our cyto folder;
cd ${cbamDir} && ls *bam | parallel -j "${threadCount}" fetchFastq {} || die "cannot write fastqs for each ..."
ssec "done generating fastq for all alignents"
## compute whippet over each fastq
# generate a list of fastqs
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment