#!/bin/sh
#$ -S /bin/sh
# 全ての ChIP + Open chromatin の BED をソートする
# qsub -e makePeakBrowser.log.txt -o makePeakBrowser.log.txt chipatlas/sh/makePeakBrowser/makePeakBrowser.sh 600

splitN=$1
: > makePeakBrowser.log.txt

# Curation 済みの ag_Statistics-hg19-tab.tsv を ag_Statistics-hg38-tab.tsv にコピー
for org in `cat chipatlas/sh/preferences.txt| awk '$1 == "Genome"'| cut -f2| tr ' ' '\n'| cut -d = -f1| grep ,`; do
  ls chipatlas/classification/*Statistics-*-tab.tsv| awk '
  BEGIN {
    split("'$org'", a, ",")
  } $1 ~ a[1] {
    printf "cat " $1
    sub(a[1], a[2], $1)
    print " > " $1
  }'
done| sh

# ag_Statistics-hg19-tab.tsv を ag_Index.hg19.tab に変形
for TSV in `ls chipatlas/classification/*Statistics-*-tab.tsv`; do
  outIndex=`echo $TSV| sed 's/Statistics-/Index./'| sed 's/-tab.tsv/.tab/'`
  tail -n +2 $TSV| awk -F '\t' '$4 != $5 {
    gsub(/?[0-9]/, "", $5)
    print $4 "\t" $5
  }' > $outIndex
done
  # input|NA	InP@ Input
  # H3K9me3|AB8898	His@ H3K9me3
  # anti-H3 (Abcam 1791)	His@ H3

# PeakBrowser 用の metadata を作成
GENOME=`ls chipatlas/results`
ls chipatlas/lib/metadata/NCBI_SRA_Metadata_Full_20*| tail -n1| xargs cat| awk -F '\t' -v GENOME="$GENOME" '
BEGIN {
  for (i=1; i<=split(GENOME, g, " "); i++) {
    genome = g[i]
    fn1 = "chipatlas/classification/ag_Index."genome".tab"
    fn2 = "chipatlas/classification/ct_Index."genome".tab"
    fn3 = "chipatlas/results/"genome"/tag/allTags.txt"
    while ((getline < fn1) > 0) a[genome, $1] = $2 
    while ((getline < fn2) > 0) l[genome, $1] = $2
    while ((getline < fn3) > 0) {
      A[$1] = a[genome, $5]
      C[$1] = l[genome, $7]
      G[$1] = genome
    }
  }
} A[$1] {
  printf "%s", $1 "\t" G[$1] "\t" A[$1] "\t" C[$1] "\t" $3 "\t" $4
  for (i=18; i<=NF; i++) printf "\t%s", $i
  printf "\n"
}'| awk -F '\t' '
function symbolSub(Str,underScore) {
  gsub("%", "%25", Str)   # IGV で表示できない文字 %+;=" space を URL エンコーディング に変換
  gsub("+", "%2B", Str)
  gsub(";", "%3B", Str)
  gsub("=", "%3D", Str)
  gsub("\"", "%22", Str)
  gsub(" ", "%20", Str)
  if (underScore == "_") gsub(underScore, "%20", Str)
  return Str
}
BEGIN {
  while ((getline < "chipatlas/sh/abbreviationList_AG.tab") > 0) AGL[$1] = $2
  while ((getline < "chipatlas/sh/abbreviationList_CT.tab") > 0) CTL[$1] = $2
} {
  SRX = $1
  Title[SRX] = symbolSub($5,"_")
  sub("xxx", "NA", Title[SRX])
  
  ag = substr($3, 6)
  ct = substr($4, 6)
  
  if (substr($4, 1, 3) == "Unc") 
    cellGroup = "NA"  # Unc の場合
  else
    cellGroup = symbolSub(CTL[substr($4, 1, 3)],"_")
  fi
  
  for (i=7; i<=NF; i++) {
    str = $i
    sub("=", SUBSEP, str)
    str = symbolSub(str)
    split(str, arr, SUBSEP)
    Atrb[SRX] = Atrb[SRX] arr[1] "=" arr[2] ";"
  }
  printf "%s", $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5 "\t" $6 "\t"
  printf "ID=%s;", SRX
  printf "Name=%s%20", symbolSub(ag,"_")
  printf "(@%20"
  printf "%s", symbolSub(ct,"_")
  printf ");"
  printf "Title=%s;", Title[SRX]
  printf "%s%s;", "Cell%20group=", cellGroup
  printf "<br>%s", Atrb[SRX]
  for (i=7; i<=NF; i++) printf "\t%s", $i
  printf "\n"
}' > chipatlas/lib/metadata/metadataForPeakBrowser.tsv

#  1  DRX000201                       DRX000203                       # SRX
#  2  mm9                             mm9                             # Genome (mm9 と mm10 の重複なし)
#  3  Unc@ Unclassified               Unc@ Unclassified               # 抗原
#  4  Myo@ C2C12                      Myo@ C2C12                      # 細胞
#  5  C2C12_Growth_H3.3               C2C12_Chd2mir3139_Growth_H3.3   # タイトル (GSM 含む)
#  6  ChIP-Seq                        ChIP-Seq                        # Seq タイプ
#  7  ID=DRX000201;Name=Unclassified  ID=DRX000203;Name=Unclassified  # Peak Browser のメタデータ
#  8  sample_name=DRS000203           sample_name=DRS000204           # 以降、オリジナルのメタデータ
#  9  strain=C2C12                    strain=C2C12                    
# 10  sample comment=source: C2C12 c  sample comment=source:C2C12 ce  
# 11  cell type=myoblast cells        cell type=myoblast cells        

ls tmpDirForPeakBrowser && mv tmpDirForPeakBrowser tmpDirForPeakBrowser_old && rm -rf tmpDirForPeakBrowser_old &
mkdir -p tmpDirForPeakBrowser/splitByChr
mkdir -p tmpDirForPeakBrowser/sortByChr
mkdir -p tmpDirForPeakBrowser/sortedBed
mkdir -p tmpDirForPeakBrowser/splitByAttributes

# 全ての BED ファイルを chr ごとに分割する
for genome in `ls chipatlas/results`; do
  for qval in 05 10 20; do
    mkdir -p tmpDirForPeakBrowser/splitByChr/$genome/$qval
    echo chipatlas/results/$genome/Bed$qval/Bed/*bed
  done
  for mr in hmr hypermr pmd; do
    mkdir -p tmpDirForPeakBrowser/splitByChr/$genome/bmap
    echo chipatlas/results/$genome/bmap/$mr/Bed/*.bed
  done
done| tr ' ' '\n'| awk -F '[/.]' '{
  if ($4 == "bmap") print "sh chipatlas/sh/makePeakBrowser/splitByChr.sh", $0, $3, $5, $7, "bmap"
  else              print "sh chipatlas/sh/makePeakBrowser/splitByChr.sh", $0, $3, $7, $6, "chip"
}'| bin/splitQsub -p $splitN -j splitByChr -t tmp/splitByChr
~/bin/wait4qsub splitByChr # 31 min / 600 threads, 130 min / 100 threads


# 全ての BED ファイルを ID ごとにソートする
for genome in `ls chipatlas/results`; do
  for type in 05 10 20 bmap; do
    echo tmpDirForPeakBrowser/splitByChr/$genome/$type/*/*| tr ' ' '\n'
  done
done| tr ' ' '\n'| awk '{
  print "sh chipatlas/sh/makePeakBrowser/sortById.sh", $1
}'| bin/splitQsub -p $splitN -j sortById -t tmp/sortById
~/bin/wait4qsub sortById # 14 min / 600 threads, 9 min / 100 threads


# 全ての BED ファイルを chr ごとにソートする
for genome in `ls chipatlas/results`; do
  for type in 05 10 20 bmap; do
    for chr in `cut -f1 chipatlas/lib/genome_size/$genome.chrom.sizes`; do
      echo "sh chipatlas/sh/makePeakBrowser/sortByChr.sh $genome $type $chr"
    done
  done
done| bin/splitQsub -p $splitN -j sortByChr -t tmp/sortByChr
~/bin/wait4qsub sortByChr # 5 min / 600 threads, 8 min / 100 threads


# 全ての chr ファイルを連結する
for genome in `ls chipatlas/results`; do
  for type in 05 10 20 bmap; do
    qsub -e /dev/null -o /dev/null -N sortAll chipatlas/sh/makePeakBrowser/sortAll.sh $genome $type
  done
done

# temp ファイルの削除
for genome in `ls chipatlas/results`; do
  for type in 05 10 20 bmap; do
    echo tmpDirForPeakBrowser/splitByChr/$genome/$type/*
  done
done| tr ' ' '\n'| awk '{
  print "rm -rf " $0
}'| bin/splitQsub -p $splitN -j delTemp -t tmp/delTemp

~/bin/wait4qsub sortAll # 5 min / 600 threads, 5 min / 100 threads
~/bin/wait4qsub delTemp # 1 min / 600 threads, 1 min / 100 threads

# 不要なファイルの削除
rm -rf tmpDirForPeakBrowser/sortByChr
rm -rf tmpDirForPeakBrowser/splitByChr

rm -rf tmp/splitByChr
rm -rf tmp/sortById
rm -rf tmp/sortByChr
rm -rf tmp/delTemp

# bmap を bs というファイル名に変更
echo tmpDirForPeakBrowser/sortedBed/*/*bmap.bed| tr ' ' '\n'| awk '{
  printf "mv " $1
  sub("bmap", "bs", $1)
  print " " $1
}'| sh


# allPeaks_light の作成、insilicoChIP のためのファイルの作成、summary/allLineNum.txt の作成
rm -rf tmpDirForPeakBrowser/allPeaks_light
rm -rf tmpDirForPeakBrowser/inSilicoChIP
rm -rf tmpDirForPeakBrowser/linNum

mkdir -p tmpDirForPeakBrowser/linNum
for genome in `ls chipatlas/results`; do
  mkdir -p tmpDirForPeakBrowser/allPeaks_light/$genome
  mkdir -p tmpDirForPeakBrowser/inSilicoChIP/$genome
  for qval in 05 10 20 50 bs; do
    qsub -e /dev/null -o /dev/null chipatlas/sh/makePeakBrowser/allPeaks_light.sh $genome $qval
  done
done


# 抗原大・細胞大で split
rm -rf tmpDirForPeakBrowser/splitByAttributes
rm -rf tmpDirForPeakBrowser/public
rm -rf tmpDirForPeakBrowser/bedList
for genome in `ls chipatlas/results`; do
  mkdir -p tmpDirForPeakBrowser/splitByAttributes/$genome
  mkdir -p tmpDirForPeakBrowser/public/$genome
  mkdir -p tmpDirForPeakBrowser/bedList/$genome
done

GENOME=`ls -1 chipatlas/results| tr '\n' ' '`
cat chipatlas/lib/metadata/metadataForPeakBrowser.tsv| awk -F '\t' -v GENOME="$GENOME" '{
  gsub(/[0-9]/, "", $2)
  g[$2]++
  A[substr($3,1,3)]++
  C[substr($4,1,3)]++
  C["xxx"]++
  a[$2, substr($3,1,3)]++
  c[$2, substr($4,1,3)]++
  c[$2, "xxx"]++
} END {
  for (i=1; i<=split(GENOME, x, " "); i++) G[x[i]]++
  for (i=1; i<=split("05 10 20 50", y, " "); i++) Q[y[i]]++
  for (genome in G) {
    gen = genome
    gsub(/[0-9]/, "", gen)
    for (qval in Q) for (ag in A) for (ct in C) {
      if (a[gen, ag] > 0 && c[gen, ct] > 0 && ag != "BSF") {
        print "sh chipatlas/sh/makePeakBrowser/splitByAttributes.sh tmpDirForPeakBrowser/sortedBed/"genome"/"genome"."qval".bed "genome" "ag" "ct" "qval" xxx xxx"
      }
    }
    for (ct in C) {
      if (c[gen, ct] > 0) {
        print "sh chipatlas/sh/makePeakBrowser/splitByAttributes.sh tmpDirForPeakBrowser/sortedBed/"genome"/"genome".bs.bed "genome" BSF "ct" bs xxx xxx"
      }
    }
  }
}'| bin/splitQsub -p $splitN -j splitLxL -t tmp/splitLxL -k "-l s_vmem=16G,mem_req=16G"

~/bin/wait4qsub splitLxL # 3.3 hrs / 600 threads


# 抗原小・細胞小で split
ls -l tmpDirForPeakBrowser/splitByAttributes/*/*bed| awk 'NF > 4 && $5 > 0 {
  print $NF
}'| tr '/.' '\t\t'| awk -F '\t' -v OFS='\t' '$(NF-4) != "ALL" {
  print $(NF-6), $(NF-5), $(NF-4), $(NF-3)
}'| awk -F '\t' -v OFS='\t' '
BEGIN {
  while ((getline < "chipatlas/lib/metadata/metadataForPeakBrowser.tsv") > 0) {
    gen = $2
    gsub(/[0-9]/, "", gen)
    agL = substr($3,1,3)
    ctL = substr($4,1,3)
    agS = substr($3,6)
    ctS = substr($4,6)
    if (!ww[gen, agL, ctL, agS]++) a[gen, agL, ctL] = a[gen, agL, ctL] SUBSEP agS
    if (!xx[gen, agL, ctL, ctS]++) c[gen, agL, ctL] = c[gen, agL, ctL] SUBSEP ctS
  }
  for (j=1; j<=split("hypermr hmr pmd", b, " "); j++) B[b[j]]++
} {
  genome = $1
  gen = $1
  gsub(/[0-9]/, "", gen)
  agL = $2
  ctL = $3
  qvl = $4
  N = split(a[gen, agL, ctL], A, SUBSEP)
  if (agL == "BSF") {
    for (mr in B) print genome, qvl, agL, "xxx", mr, "xxx"
    for (mr in B) print genome, qvl, agL, ctL, mr, "xxx"
  } else {
    for (i=2; i<=N; i++) print genome, qvl, agL, "xxx", A[i], "xxx"
    for (i=2; i<=N; i++) print genome, qvl, agL, ctL, A[i], "xxx"
  }
  delete A
  N = split(c[gen, agL, ctL], C, SUBSEP)
  for (i=2; i<=N; i++) print genome, qvl, agL, ctL, "xxx", C[i]
  delete C
}'| awk '!a[$0]++'| awk -F '\t' -v OFS='\t' '{
  Lc = ($4 == "xxx") ? "ALL" : $4
  print "sh chipatlas/sh/makePeakBrowser/splitByAttributes.sh tmpDirForPeakBrowser/splitByAttributes/"$1"/"$3"."Lc"."$2".AllAg.AllCell.bed "$1" "$3" "$4" "$2" \""$5"\" \""$6"\""
}'| bin/splitQsub -p $splitN -j splitSxS -t tmp/splitSxS -k "-l s_vmem=16G,mem_req=16G"

~/bin/wait4qsub splitSxS   # 3.0 hrs / 600 threads
~/bin/wait4qsub makeBed9GF # 19 hrs  / 600 threads


# experimentList.tab, fileList.tab などの作成
qsub -e assembleList.log.txt -o assembleList.log.txt -N makeList chipatlas/sh/assembleList.sh
~/bin/wait4qsub makeList # 19 hrs  / 600 threads

# lineNum.tsv の作成
echo tmpDirForPeakBrowser/linNum/*| xargs awk -F '/.' '{
  print FILENAME "\t" $1
}'| tr '/.' '\t\t'| awk -F '\t' -v OFS='\t' '
BEGIN {
  while ((getline < "chipatlas/sh/abbreviationList_AG.tab") > 0) a[$1] = $2
  while ((getline < "chipatlas/sh/abbreviationList_CT.tab") > 0) c[$1] = $2
} {
  print $3, a[$4], c[$5], $6, $10
}' > chipatlas/lib/inSilicoChIP/lineNum.tsv

# allPeaks_light の更新
for genome in `ls chipatlas/results`; do
  mv -f tmpDirForPeakBrowser/allPeaks_light/$genome/* chipatlas/results/$genome/allPeaks_light/
done

# public を削除
for genome in `ls chipatlas/results`; do
  echo chipatlas/results/$genome/public/* chipatlas/lib/inSilicoChIP/results/$genome/public/*
done| tr ' ' '\n'| awk '{
  print "rm -f "$1
}'| bin/splitQsub -p $splitN -j rmPublic
~/bin/wait4qsub rmPublic # 3 min  / 300 threads

# public, in silico ChIP データを移動
for genome in `ls chipatlas/results`; do
  rm -rf chipatlas/results/$genome/public
  rm -rf chipatlas/lib/inSilicoChIP/results/$genome/public
  mv tmpDirForPeakBrowser/public/$genome chipatlas/results/$genome/public
  mv tmpDirForPeakBrowser/inSilicoChIP/$genome chipatlas/lib/inSilicoChIP/results/$genome/public
done

exit
