#$ -S /bin/bash
#$ -cwd

cd
EXPERIMENT=$1
SRX=$1
REVISION=$2
Genome=$2
projectDir=$3
signal=$4
logF="$projectDir/$Genome.$SRX.log.txt"
: > $logF
genome2=`cat ~/chipatlas/sh/preferences.txt| awk '$1 == "Genome"'| cut -f2| tr '\t= ' '\n\n\n'| tr -d ','| grep $Genome| sed "s/$Genome//"`
Logfile2="$projectDir/$genome2.$SRX.log.txt"

case ${REVISION} in
  hg19) SPECIES="H_sapiens";;
  hg38) SPECIES="H_sapiens";;
  mm9) SPECIES="M_musculus";;
  mm10) SPECIES="M_musculus";;
  rm6) SPECIES="R_norvegicus";;
  ce10) SPECIES="C_elegans";;
  ce11) SPECIES="C_elegans";;
  dm3) SPECIES="D_melanogaster";;
  dm6) SPECIES="D_melanogaster";;
  sacCer3) SPECIES="S_cerevisiae";;
esac

ResDir=$projectDir/results/${REVISION}/$SRX
rm -rf $ResDir
mkdir -p $ResDir
echo -e "Job ID = $JOB_ID"
echo -e "SRX = $SRX"
echo -e "Genome = $Genome"
start=`date +%s`
echo -e "Started at\t`date "+%y%m%d_%T"`"
echo -e "\nsra ファイルのダウンロード中...\n"
runInfo=`cat ~/chipatlas/lib/metadata/SRA_Metadata_RunInfo.tab| awk '$1 == "'$SRX'"'`
SorP=`echo $runInfo| cut -d ' ' -f2`  # SorP=0: SINGLE, 1: PAIRED
if [ "$SorP" = "0" ] ; then
  echo "Read layout: SINGLE"
  optRmdup="-s"
else
  echo "Read layout: PAIRED"
  Split="--split-files"
fi
echo ""
echo -e "\nfastq に変換中...\n"
echo $runInfo| awk 'NF < 3 {print "NO_RUN_INFO"}'
NRI=`cat $logF| grep -c NO_RUN_INFO`
if [ $NRI -gt 0 ]; then
  rm -rf $ResDir
  if [ ! -f $Logfile2 ]; then
    echo "新ゲノム処理が開始しなかった" > $Logfile2
  fi
  exit
fi
cd $ResDir
function stopFasterqDump() {  # Fasterq-dump が 24 時間経っても終わらない場合は強制停止
  while :; do
    let cnt=$cnt+1
    logLine=`cat ~/$logF| grep -c "fastq に変換しました。"`
    if [ $logLine = 0 -a $cnt = 24 ]; then
      echo "Fasterq-dump が 24 h 以内に終わらないので強制停止。"
      cd
      for i in `seq 10`; do
        rm -rf $ResDir
      done
      if [ ! -f $Logfile2 ]; then
        echo "新ゲノム処理が開始しなかった" > $Logfile2
      fi
      qdel $JOB_ID
    fi
    if [ $logLine -gt 0 ]; then
      break
    fi
    sleep 3600
  done
}
stopFasterqDump &
function stopError() {  # core dump / aberrant_fastq / SorP が実際と一致しない場合は強制停止
  while :; do
    logLine1=`cat ~/$logF| grep -c "core dumped"`
    logLine2=`cat ~/$logF| grep -c "Thread creation error"`
    logLine3=`cat ~/$logF| grep -c "Couldn\'t open file"`
    if [ $logLine1 -gt 0 ]; then
      echo "コアダンプのため強制停止。"
      cd
      for i in `seq 10`; do
        rm -rf $ResDir
      done
      if [ ! -f $Logfile2 ]; then
        echo "新ゲノム処理が開始しなかった" > $Logfile2
      fi
      qdel $JOB_ID
    elif [ $logLine2 -gt 0 ]; then
      echo "fastq 異常のため強制停止。"
      cd
      for i in `seq 10`; do
        rm -rf $ResDir
      done
      if [ ! -f $Logfile2 ]; then
        echo "新ゲノム処理が開始しなかった" > $Logfile2
      fi
      qdel $JOB_ID
    elif [ $logLine3 -gt 0 ]; then
      echo "SorP 異常のため強制停止。"
      cd
      for i in `seq 10`; do
        rm -rf $ResDir
      done
      if [ ! -f $Logfile2 ]; then
        echo "新ゲノム処理が開始しなかった" > $Logfile2
      fi
      qdel $JOB_ID
    fi
    sleep 5
  done
}
stopError &
mkdir -p ~/tmpDirForFastq
if [ "$signal" = "g2" ]; then
  mv ~/tmpDirForFastq/$SRX/* ~/$ResDir/ 2>/dev/null
  rm -rf ~/tmpDirForFastq/$SRX
  echo -e "\nfastq に変換しました。\n"
else
  sraV="2.10.7"
  cmd="cd && mv $ResDir $ResDir"2" && rm -rf $ResDir"2" && echo "新ゲノム処理が開始しなかった" > $Logfile2 && qdel $JOB_ID && sleep 10" # 強制終了コマンド。summary は作られない。
  for srr in `echo $runInfo| cut -d ' ' -f3-`; do
    srxS1=`echo $SRX| cut -c1-3`
    srxS2=`echo $SRX| cut -c1-6`
    sraPath="/usr/local/resources/dra/sralite/ByExp/litesra/$srxS1/$srxS2/$SRX/$srr/$srr.sra"
    if [ -f "$sraPath" ]; then
      mkdir -p $srr
      ln -s "$sraPath" $srr/$srr.sra
    else
      while :; do
        /home/okishinya/chipatlas/bin/sratoolkit."$sraV"-ubuntu64/bin/prefetch --max-size 1000GB $srr 2>&1
        ls $srr/$srr.sra > /dev/null 2>&1 && break
      done
    fi
    while :; do
      /home/okishinya/chipatlas/bin/sratoolkit."$sraV"-ubuntu64/bin/fastq-dump $Split $srr/$srr.sra 2>&1
      ls "$srr"*fastq > /dev/null 2>&1 && rm -rf $srr && break
    done
  done| awk -v ResDir=$ResDir -v Logfile2=$Logfile2 '
  BEGIN {
    cmd1 = "cd; rm -rf '$ResDir'" 
    cmd2 = "qdel '$JOB_ID'" 
    err1 = " - no data \\( 404 \\)"  # .sra が存在しない (ERX149721)
    err2 = "err: query unauthorized while resolving"  # human sequence が非公開 (SRX3879200)
    err3 = "Cannot resolve accession \\( 404 \\)"  # 原因不明 (ERX1103222)
  } {
    print $0
    if ($0 ~ err1 || $0 ~ err2 || $0 ~ err3) {
      print "sratoolkit のエラーにより強制停止。"
      print "新ゲノム処理が開始しなかった" > Logfile2
      for (i=1; i<=10; i++) system(cmd1)
      system(cmd2)
    }
  }'
  for srr in `echo $runInfo| cut -d ' ' -f3-`; do
    rm -f /home/$LoginID/ncbi/public/sra/$srr.sra.cache
  done
  if [ $SorP = "0" ] ; then
    size["fastq"]=`ls -l *fastq| awk '{x += $5} END {print x}'`
  else
    size["fastq"]=`ls -l *1.fastq| awk '{x += $5} END {print x}'`
  fi
  for fq in `ls *fastq`; do
    pigz -f -p $NSLOTS $fq
  done
  echo -e "\nfastq に変換しました。\n"
fi
ls -lt | grep "fastq.gz"
# mapping
RUN_OPTION=`ls *fastq.gz| tr '.' '_'| cut -d_ -f1| uniq| awk -v SorP=$SorP '{
  if (SorP == 0) printf " -fastq " $1 ".fastq.gz"
  if (SorP == 1) printf " -pfastq " $1 "_1.fastq.gz " $1 "_2.fastq.gz"
}'`
~/chipatlas/bin/BMap -species ${SPECIES} -revision ${REVISION} -SCF 2 -SCL 32 -thread $NSLOTS ${RUN_OPTION} -sum ${EXPERIMENT}.sum.gz -bisulalign ${EXPERIMENT}.bisulalign > temp 2>&1
cat temp | grep -v "chr" | grep -v "LAMBDA" | grep -v "reads processed"
rm temp
# 新旧ゲノム
if [ "$signal" = "g1" ]; then
  mkdir ~/tmpDirForFastq/$SRX
  mv *fastq.gz ~/tmpDirForFastq/$SRX/
  ARID=`qstat -j $JOB_ID| awk '$1 == "ar_id:" {printf "-ar " $2}'`
  genome2=`cat ~/chipatlas/sh/preferences.txt| awk '$1 == "Genome"'| cut -f2| tr '\t= ' '\n\n\n'| tr -d ','| grep $Genome| sed "s/$Genome//"`
  cd
  if [ `echo $genome2| wc -c` -gt 1 ]; then
    Logfile2="$projectDir/$genome2.$SRX.log.txt"
    qsub $ARID -N "srT$genome2" -o $Logfile2 -e $Logfile2 -pe def_slot $NSLOTS ~/chipatlas/sh/bmap4chipatlas_210329.sh $SRX $genome2 $projectDir g2
  fi
  cd $ResDir
else
  rm *fastq.gz
fi
pigz -f -p $NSLOTS ${EXPERIMENT}.bisulalign
# .bisulalign.gz の check
if [ ! -f ./${EXPERIMENT}.bisulalign.gz ] || [ ! -f ./${EXPERIMENT}.sum.gz ]; then
  echo -e "=-=-=BMap\tBad"
  rm -rf $ResDir
  exit
else
  echo -e "=-=-=BMap\tOK"
fi
ls -lt | grep ".gz"
# bmap_sum
for ((i=1;i<=5;i++)); do 
  xvfb-run -d ~/chipatlas/sh/bmap_sum_210107.sh ${EXPERIMENT} ${REVISION} ${SPECIES} ${NSLOTS}
  sleep 2
  if [ -f ./${EXPERIMENT}.cpg.cover.bedGraph.gz ] && [ -f ./${EXPERIMENT}.cpg.methyl.bedGraph.gz ]; then
    echo -e "=-=-=bmap_sum\tOK"
    break
  else
    echo "=-=-=bmap_sum failed; retry"
    sleep 2
  fi
done
if [ ! -f ./${EXPERIMENT}.cpg.cover.bedGraph.gz ] || [ ! -f ./${EXPERIMENT}.cpg.methyl.bedGraph.gz ]; then
  echo -e "=-=-=bmap_sum\tbad"
fi
# hypo- hyper- partially methylated region の作成
gunzip -c ${EXPERIMENT}.cpg.methpipe.tab.gz | grep -v "LAMBDA" | sort -k 1,1 -k 2,2n -k 3,3n -k 6,6 > ${EXPERIMENT}.cpg.methpipe.sorted.tab
rm *.cpg.methpipe.tab.gz
if [ -s ./${EXPERIMENT}.cpg.methpipe.sorted.tab ]; then
  ~/chipatlas/bin/hmr -o ${EXPERIMENT}.hmr ${EXPERIMENT}.cpg.methpipe.sorted.tab
  ~/chipatlas/bin/pmd -o ${EXPERIMENT}.pmd ${EXPERIMENT}.cpg.methpipe.sorted.tab
  ~/chipatlas/bin/hypermr -o ${EXPERIMENT}.hypermr ${EXPERIMENT}.cpg.methpipe.sorted.tab
  cut -f1-3 ${EXPERIMENT}.hmr > temp
  mv temp ${EXPERIMENT}.hmr
  cut -f1-3 ${EXPERIMENT}.pmd > temp
  mv temp ${EXPERIMENT}.pmd
  cut -f1-3 ${EXPERIMENT}.hypermr > temp
  mv temp ${EXPERIMENT}.hypermr
else
  touch ${EXPERIMENT}.hmr
  touch ${EXPERIMENT}.pmd
  touch ${EXPERIMENT}.hypermr
fi
# .hmr .pmd .hypermr の check
if [ ! -f ./${EXPERIMENT}.hmr ] || [ ! -f ./${EXPERIMENT}.pmd ] || [ ! -f ./${EXPERIMENT}.hypermr ]; then
  echo -e "=-=-=hypo- hyper-me-region\tBad"
  rm -rf $ResDir
  exit
else
  echo -e "=-=-=hypo- hyper-me-region\tOK"
fi
ls -lt | grep ".hmr"
ls -lt | grep ".pmd"
ls -lt | grep ".hypermr"
rm *cpg.methpipe.sorted.tab
# BigWig の作成
gunzip -c ${EXPERIMENT}.cpg.methyl.bedGraph.gz | grep -v "LAMBDA" | awk -F"\t" -v OFS="\t" '$4>0 {print $1,$2,$3+1,$4}'> ${EXPERIMENT}.cpg.methyl.bedGraph
rm *.cpg.methyl.bedGraph.gz
/home/okishinya/bin/qsortBed -t tmp_qsortBed ${EXPERIMENT}.cpg.methyl.bedGraph > ${EXPERIMENT}.cpg.methyl.sorted.bedGraph
rm *.cpg.methyl.bedGraph
/home/okishinya/bin/bedGraphToBigWig ${EXPERIMENT}.cpg.methyl.sorted.bedGraph ~/chipatlas/lib/genome_size/${REVISION}.chrom.sizes ${EXPERIMENT}.cpg.methyl.bw
rm *.cpg.methyl.sorted.bedGraph
gunzip -c ${EXPERIMENT}.cpg.cover.bedGraph.gz | grep -v "LAMBDA" | awk -F"\t" -v OFS="\t" '$4>0 {print $1,$2,$3+1,$4}'> ${EXPERIMENT}.cpg.cover.bedGraph
rm *.cpg.cover.bedGraph.gz
/home/okishinya/bin/qsortBed -t tmp_qsortBed ${EXPERIMENT}.cpg.cover.bedGraph > ${EXPERIMENT}.cpg.cover.sorted.bedGraph
rm *.cpg.cover.bedGraph
/home/okishinya/bin/bedGraphToBigWig ${EXPERIMENT}.cpg.cover.sorted.bedGraph ~/chipatlas/lib/genome_size/${REVISION}.chrom.sizes ${EXPERIMENT}.cpg.cover.bw
# .cpg.cover.bw .cpg.methyl.bw の check
if [ ! -f ./${EXPERIMENT}.cpg.cover.bw ] || [ ! -f ./${EXPERIMENT}.cpg.methyl.bw ]; then
  echo -e "=-=-=BigWig\tBad"
  rm -rf $ResDir
  exit
else
  echo -e "=-=-=BigWig\tOK"
fi
ls -lt | grep ".bw"

# 統計量まとめ
echo ""
for stats in cg mapping_rate_1 mapping_rate_2 num_reads methyl_rate_cpg_total methyl_rate_cpg_lambda low_methyl_region partial_methyl_region high_methyl_region coverage; do
  unset `echo $stats`
done
case "${REVISION}" in
  "mm9" ) cg=43445914 ;;
  "mm10" ) cg=43816016 ;;
  "hg19" ) cg=57400172 ;;
  "hg38" ) cg=61959486 ;;
  "rm6") cg=53698106 ;;
  "ce10") cg=6263038 ;;
  "ce11") cg=6263050 ;;
  "dm3") cg=13300958 ;;
  "dm6") cg=11787346 ;;
  "sacCer3") cg=710598 ;;
esac

# mapping 率
eval $(cat ./${EXPERIMENT}.mapsum.html | grep -A 6 "<tbody>" | awk -F"<td>" '{gsub ("</td>",""); print $2}' | awk '$0 != "" {printf "%s\t", $0}' | awk -F"\t" '{
  if ($2 == 0) {
    gsub (/ \(/,"\t")
    gsub (/\)/,"")
    gsub ("%","")
    printf ("mapping_rate_1=%s\tmapping_rate_2=%s", 1-($4/100), 1-($6/100))
  }
  else {
    gsub (/ \(/,"\t")
    gsub (/\)/,"")
    gsub ("%","")
    printf ("mapping_rate_1=%s\tmapping_rate_2=%s", 1-($3/100), "N/A")
  }
}')
eval $(cat ./${EXPERIMENT}.mapsum.html | grep -A 6 "<tfoot>" | awk -F"<td>" '{gsub ("</td>",""); print $2}' | awk '$0 != "" {gsub (/\(100%\)/, ""); gsub (",",""); printf "%s\t", $0}' | awk -F"\t" '{
  if ($2 == 0) {
    printf ("num_reads=%s", $3)
  }
  else {
    printf ("num_reads=%s", $2)
  }
}')
rm *.mapsum.html
echo -e "=-=-=num_reads\t"${num_reads}
echo -e "=-=-=mapping_rate_1\t"${mapping_rate_1}
echo -e "=-=-=mapping_rate_2\t"${mapping_rate_2}
# read 数，total メチル化率，LAMBDA メチル化率
eval $(cat ./${EXPERIMENT}.stat.html | grep -A 20 "<td>all</td>" | grep "<td align=\"right\">" | awk -F"<td align=\"right\">" '{gsub ("</td>",""); gsub (",",""); printf "%s\t", $2}' | awk -F"\t" '{
  if ($18 != "-") printf ("methyl_rate_cpg_total=%s", $18/100)
  else printf ("methyl_rate_cpg_total=%s", $18)
}')
eval $(cat ./${EXPERIMENT}.stat.html | grep -A 20 "<td>LAMBDA</td>" | grep "<td align=\"right\">" | awk -F"<td align=\"right\">" '{gsub ("</td>",""); gsub (",",""); printf "%s\t", $2}' | awk -F"\t" '{
  if ($18 != "-") printf ("methyl_rate_cpg_lambda=%s", $18/100)
  else printf ("methyl_rate_cpg_lambda=%s", $18)
}')
rm *.stat.html
echo -e "=-=-=methyl_rate_cpg_total\t"${methyl_rate_cpg_total}
echo -e "=-=-=methyl_rate_cpg_lambda\t"${methyl_rate_cpg_lambda}
# hypo-, partial, hyper-methylated region 数
low_methyl_region=$(wc -l ./${EXPERIMENT}.hmr | cut -d" " -f1)
partial_methyl_region=$(wc -l ./${EXPERIMENT}.pmd | cut -d" " -f1)
high_methyl_region=$(wc -l ./${EXPERIMENT}.hypermr | cut -d" " -f1)
echo -e "=-=-=low_methyl_region\t"${low_methyl_region}
echo -e "=-=-=partial_methyl_region\t"${partial_methyl_region}
echo -e "=-=-=high_methyl_region\t"${high_methyl_region}
# カバー率 (cover_integral/cg)
eval $(cat ./${EXPERIMENT}.cpg.cover.sorted.bedGraph | awk -F"\t" -v cg=$cg '
BEGIN {
  cover_integral = 0
} {
    cover_integral += $4
}
END {
  printf ("cover_integral=%s\tcoverage=%s", cover_integral, cover_integral / cg)
}')
rm *.cpg.cover.sorted.bedGraph
echo -e "=-=-=coverage\t"${coverage}
end=`date +%s`
duration=`echo "scale=3; ($end-$start)/3600" | bc`
echo -e "=-=-=end\t`date "+%y%m%d_%T"`"
echo -e "=-=-=duration\t"${duration}
exit

