#!/bin/sh
#$ -S /bin/sh

# --------------------------------
# $1 SRX (例: SRX023509)
# $2 Genome  (例: mm9)
# $3 $projectDir (例: chipome_ver3)
# $4 q-val threshold (例: "05 10 20")

# qsub -N "srT$Genome" -o $Logfile -e $Logfile -pe def_slot $nslot $projectDir/sh/sraTailor.sh $SRX $Genome $projectDir "$QVAL" <g1|g2>
# g1: FastQ ファイルを残す。 g2: FastQ ファイルを再利用する
# --------------------------------

SorP=0
SRX=$1
Genome=$2
projectDir=$3
qVal=$4
signal=$5
LoginID=`whoami`
bn=$SRX
ResDir=$projectDir/results/$Genome/$bn
declare -A size                # -Aでハッシュ宣言 
PastT=`date +%s`
module load singularity

fastqDump=/home/$LoginID/$projectDir/bin/sratoolkit.2.9.4-ubuntu64/bin/fasterq-dump
fasterqDump=/home/$LoginID/$projectDir/bin/sratoolkit.2.9.6-1-ubuntu64/bin/fasterq-dump
bowtie2=/home/$LoginID/$projectDir/bin/bowtie2-2.2.2/bowtie2
samtools=/home/$LoginID/$projectDir/bin/samtools-0.1.19/samtools
# macs2="/usr/local/biotools/m/macs2:2.1.1--r3.2.2_0 macs2"


bedtools=/home/$LoginID/$projectDir/bin/bedtools-2.17.0/bin/bedtools
bedGraphToBigWig=/home/$LoginID/$projectDir/bin/bedGraphToBigWig
bedClip=/home/$LoginID/$projectDir/bin/bedClip
bedToBigBed=/home/$LoginID/$projectDir/bin/bedToBigBed
gsize=/home/$LoginID/$projectDir/lib/genome_size/$Genome.chrom.sizes

logF="$projectDir/results/$Genome/log/$SRX.log.txt"
: > "$logF"
rm -rf $ResDir
mkdir $ResDir
rm -f $projectDir/results/$Genome/summary/$SRX.txt
lfs setstripe -c 8 $ResDir


####################################################################################################
#                                             Fasterq-dump
####################################################################################################
echo -e "Job ID = $JOB_ID"
echo -e "SRX = $SRX"
echo -e "Genome = $Genome"
echo -e "\nsra ファイルのダウンロード中...\n"

runInfo=`cat $projectDir/lib/metadata/SRA_Metadata_RunInfo.tab| awk '$1 == "'$SRX'"'`
SorP=`echo $runInfo| cut -d ' ' -f2`  # SorP=0: SINGLE, 1: PAIRED

if [ "$SorP" = "0" ] ; then
  echo "Read layout: SINGLE"
  optRmdup="-s"
else
  echo "Read layout: PAIRED"
  Split="--split-files"
fi

echo ""
echo -e "\nfastq に変換中...\n"

echo $runInfo| awk 'NF < 3 {print "NO_RUN_INFO"}'
NRI=`cat $logF| grep -c NO_RUN_INFO`
if [ $NRI -gt 0 ]; then
  rm -rf $ResDir
  exit
fi

cd $ResDir
function stopFasterqDump() {  # Fasterq-dump が 24 時間経っても終わらない場合は強制停止
  while :; do
    let cnt=$cnt+1
    logLine=`cat ~/$logF| grep -c "fastq に変換しました"`
    if [ $logLine = 0 -a $cnt = 24 ]; then
      echo "Fasterq-dump が 24 h 以内に終わらないので強制停止。"
      cd
      for i in `seq 10`; do
        rm -rf $ResDir
      done
      qdel $JOB_ID
    fi
    if [ $logLine -gt 0 ]; then
      break
    fi
    sleep 3600
  done
}
stopFasterqDump &

mkdir -p ~/tmpDirForFastq
if [ "$signal" = "g2" ]; then
  mv ~/tmpDirForFastq/$SRX[._]*fq ~/$ResDir/ 2>/dev/null
else
  sraV="2.10.7"
  cmd="cd && mv $ResDir $ResDir"2" && rm -rf $ResDir"2" && qdel $JOB_ID && sleep 10" # 強制終了コマンド。summary は作られない。
  for srr in `echo $runInfo| cut -d ' ' -f3-`; do
    srxS1=`echo $SRX| cut -c1-3`
    srxS2=`echo $SRX| cut -c1-6`
    sraPath="/usr/local/resources/dra/sralite/ByExp/litesra/$srxS1/$srxS2/$SRX/$srr/$srr.sra"
    if [ -f "$sraPath" ]; then
      mkdir -p $srr
      ln -s "$sraPath" $srr/$srr.sra
    else
      while :; do
        ~/chipatlas/bin/sratoolkit."$sraV"-ubuntu64/bin/prefetch --max-size 1000GB $srr 2>&1
        ls $srr/$srr.sra > /dev/null 2>&1 && break
      done
    fi
    while :; do
      ~/chipatlas/bin/sratoolkit."$sraV"-ubuntu64/bin/fastq-dump $Split $srr/$srr.sra 2>&1
      ls "$srr"*fastq > /dev/null 2>&1 && rm -rf $srr && break
    done
  done| awk -v ResDir=$ResDir '
  BEGIN {
    cmd1 = "cd; rm -rf '$ResDir'" 
    cmd2 = "qdel '$JOB_ID'" 
    err1 = " - no data \\( 404 \\)"  # .sra が存在しない (ERX149721)
    err2 = "err: query unauthorized while resolving"  # human sequence が非公開 (SRX3879200)
    err3 = "Cannot resolve accession \\( 404 \\)"  # 原因不明 (ERX1103222)
  } {
    print $0
    if ($0 ~ err1 || $0 ~ err2 || $0 ~ err3) {
      print "sratoolkit のエラーにより強制停止。"
      for (i=1; i<=10; i++) system(cmd1)
      system(cmd2)
    }
  }'
  
  lfq=`ls -1 *.fastq | wc -l`
  ls -1 *.fastq | awk -v Lfq="$lfq" -v BN=$bn -v SORP="$SorP" '{
    if (SORP == 0) {  #SINGLE
      if (Lfq == 1) print "mv " $1 " " BN ".fq"
      else print "cat " $1 " >> " BN ".fq"
    }
    else {   #PAIRED
      if (Lfq == 2) {
        if ($1 ~ /_1.fastq$/) print "mv " $1 " " BN "_1.fq"
        if ($1 ~ /_2.fastq$/) print "mv " $1 " " BN "_2.fq"
      }
      else {
        if ($1 ~ /_1.fastq$/) print "cat " $1 " >> " BN "_1.fq"
        if ($1 ~ /_2.fastq$/) print "cat " $1 " >> " BN "_2.fq"
      }
    }
  }' |/bin/sh
  
  for srr in `echo $runInfo| cut -d ' ' -f3-`; do
    rm -f /home/$LoginID/ncbi/public/sra/$srr.sra.cache
  done
fi

if [ $SorP = "0" ] ; then
  size["fastq"]=`ls -l $bn.fq| awk '{print $5}'`
else
  size["fastq"]=`ls -l $bn"_1.fq"| awk '{print $5}'`
fi

rm -f *fastq
echo -e "\nfastq に変換しました。\n"

####################################################################################################
#                                             Bowtie2
####################################################################################################
echo -e "\nbowtie でマッピング中...\n"

if [ $SorP = "0" ] ; then
  $bowtie2 -p $NSLOTS -t --no-unal -x /home/$LoginID/$projectDir/lib/bowtie_index/$Genome -q $bn.fq -S $bn.sam 2> bowtieReport.txt
else
  $bowtie2 -p $NSLOTS -t --no-unal -x /home/$LoginID/$projectDir/lib/bowtie_index/$Genome -q -1 $bn"_1.fq" -2 $bn"_2.fq" -S $bn.sam 2> bowtieReport.txt
fi
if [ "$signal" = "g1" ]; then
  mv "$bn"*fq ~/tmpDirForFastq/
  ARID=`qstat -j $JOB_ID| awk '$1 == "ar_id:" {printf "-ar " $2}'`
  genome2=`cat ~/$projectDir/sh/preferences.txt| awk '$1 == "Genome"'| cut -f2| tr '\t= ' '\n\n\n'| tr -d ','| grep $Genome| sed "s/$Genome//"`
  if [ `echo $genome2| wc -c` -gt 1 ]; then
    Logfile2="$projectDir/results/$genome2/log/$SRX.log.txt"
    qsub $ARID -N "srT$genome2" -o $Logfile2 -e $Logfile2 -pe def_slot $NSLOTS ~/$projectDir/sh/sraTailor.sh $SRX $genome2 $projectDir "$qVal" g2
  fi
else
  rm -f *.fq
fi

cat bowtieReport.txt
size["Nspots"]=`cat bowtieReport.txt | tr -d '%' | awk '{if ($0 ~ "reads; of these") printf "%d", $1}'`
size["sam"]=`ls -l $bn.sam| awk '{print $5}'`
rm bowtieReport.txt


echo -e "\nマッピングが完了しました。\n"

####################################################################################################
#                                             SamTools
####################################################################################################
echo -e "\nsamtools でBAM に変換中...\n"

$samtools view -@ $NSLOTS -S -b -o $bn.bam_unsrt $bn.sam
rm $bn.sam
$samtools sort -@ $NSLOTS $bn.bam_unsrt $bn.dup
rm $bn.bam_unsrt 
$samtools rmdup $optRmdup $bn.dup.bam $bn.bam

size["NbeforeRmdup"]=`$samtools view -c $bn.dup.bam`
size["NafterRmdup"]=`$samtools view -c $bn.bam`
size["alPercent"]=`echo ${size["NafterRmdup"]} ${size["Nspots"]}| awk '{printf "%f", 100*$1/$2}'`
size["dupPercent"]=`echo ${size["NafterRmdup"]} ${size["NbeforeRmdup"]}| awk '{printf "%f", 100*($2-$1)/$2}'`
size["Bam"]=`ls -l $bn.bam| awk '{print $5}'`
ScaleVal=`echo ${size["NafterRmdup"]} | awk '{printf "%.10f", 1000000/$1}'`
rm $bn.dup.bam
echo -e "\nBAM に変換しました。\n"

####################################################################################################
#                                             Peak-call
####################################################################################################
case "$Genome" in
  mm10 | mm9) macsg=mm;;
  hg38 | hg19) macsg=hs;;
  ce11 | ce10) macsg=ce;;
  dm6 | dm3) macsg=dm;;
  sacCer3) macsg=12100000;;
  rn6) macsg=2.15e9;;  # total genome size (= 2.87e9) の 75%
esac

echo -e "\nBed ファイルを作成中...\n"

MACS() { # $1=Qval
#  export PYTHONPATH=/home/$LoginID/$projectDir/bin/MACS2-2.1.0/$projectDir/bin/MACS2-2.1.0/lib/python2.7/site-packages:$PYTHONPATH
  BN=$bn.$1
  wd="/home/$LoginID/$projectDir/results/$Genome/$SRX/"
  singularity exec /home/$LoginID/$projectDir/bin/macs2:2.1.1--r3.2.2_0 macs2 callpeak -t $wd$bn.bam -f BAM -g $macsg -n $wd$BN -q 1e-$1

#  $macs2 callpeak -t $bn.bam -f BAM -g $macsg -n $BN -q 1e-$1
  cut -d '/' -f1,8 $wd$BN"_"peaks.narrowPeak| tr -d '/'| sort -k1,1 -k2,2n > $wd$BN"_"peaks.unclip   # cut -f1-3,5 は後回し
  $bedClip $wd$BN"_"peaks.unclip $gsize $wd$BN.bed
  awk '{printf "%s\t%s\t%s\t%s\n", $1, $2, $3, $5}' $wd$BN.bed > $wd$BN.bed.tmp
  $bedToBigBed -type=bed4 $wd$BN.bed.tmp $gsize $wd$BN.bb
  rm $wd$BN"_"model.r $wd$BN"_"*.xls $wd$BN"_"peaks.narrowPeak $wd$BN"_"peaks.unclip $wd$BN.bed.tmp
  echo CompletedMACS2peakCalling
}

for i in `echo $qVal`; do
  sleep 30
  MACS $i &
  wn="$wn $!"
done

####################################################################################################
#                                             BedGraph
####################################################################################################
echo -e "\nBedGraph に変換中...\n"
$bedtools genomecov -scale $ScaleVal -ibam $bn.bam -bg -g $gsize| awk -F '\t' '
BEGIN {
  while ((getline < "'$gsize'") > 0) x[$1]++
} x[$1] > 0' > $bn.bg
echo -e "\nBedGraph に変換しました。\n"

size["BedGraph"]=`ls -l $bn.bg| awk '{print $5}'`

####################################################################################################
#                                             BigWig
####################################################################################################
echo -e "\nBigWig に変換中...\n"
$bedGraphToBigWig $bn.bg $gsize $bn.bw
size["BigWig"]=`ls -l $bn.bw| awk '{print $5}'`
rm $bn.bg

echo -e "\nBigWig に変換しました。\n"

QvalNum=`echo $qVal| awk '{print NF}'`
while :; do
  macsN=`grep -c "CompletedMACS2peakCalling" /home/$LoginID/$projectDir/results/$Genome/log/$SRX.log.txt`
  erroN=`grep -c "Since the d (0) calculated" /home/$LoginID/$projectDir/results/$Genome/log/$SRX.log.txt`

  if [ $macsN -eq $QvalNum ]; then    # MACS2 が正常終了
    break
  elif [ $erroN -gt 0 ]; then  # MACS2 が "Since the d (0) calculated" エラーの場合、強制終了
    kill $wn
    break
  fi
done

cat << '==========================' > /dev/null
# まれに変な bed ができることがあるので、その場合は再実行
NabBed=`cat /home/$LoginID/$projectDir/results/$Genome/log/$SRX.log.txt| grep -c "51020_peaks.narrowPeak"`
if [ $NabBed -gt 0 ]; then
  for i in `echo $qVal`; do
    MACS $i
  done
fi
==========================

for i in `echo $qVal`; do
  size["BedP$i"]=`ls -l $bn"."$i".bed"| awk '{print $5}'`
  mv $bn"."$i".bed" /home/$LoginID/$projectDir/results/$Genome/Bed$i/Bed/
  mv $bn"."$i".bb" /home/$LoginID/$projectDir/results/$Genome/Bed$i/BigBed/
  bedidx=$bedidx" BedP"$i
done
rm $bn.bam

mv $bn.bw /home/$LoginID/$projectDir/results/$Genome/BigWig

cd
rm -r $ResDir



CurT=`date +%s`
size["Time"]=`echo "$CurT $PastT" | awk '{printf "%.2f\n", ($1-$2)/60}'`

{
  echo -ne "$SRX\t$SorP"
# for idx in fastq sam Nspots NafterRmdup alPercent dupPercent Bam BedGraph BigWig BedP5 BedP10 BedP20 Time; do
  for idx in fastq sam Nspots NafterRmdup alPercent dupPercent Bam BedGraph BigWig `echo $bedidx` Time; do
    echo -ne "\t${size[$idx]}"
  done
  echo ""
} > $projectDir/results/$Genome/summary/$SRX.txt

exit
# $1 = SRX
# $2 = Single or Paired
# $3 = FastQ サイズ
# $4 = Sam サイズ
# $5 = Nspots
# $6 = NafterRmdup
# $7 = alPercent
# $8 = dupPercent
# $9 = Bam サイズ
# $10= BedGraph サイズ
# $11= BigWig サイズ
# $12= Bed05 サイズ
# $13= Bed10 サイズ
# $14= Bed20 サイズ
# $15= Time


