#!/bin/sh
#$ -S /bin/sh

# qsub chipatlas/sh/assembleList.sh   # run at the end of makePeakBrowser.sh

# Merge the cell-type annotation tables (ATCC, Yu et al., MeSH, ENCODE,
# FlyBase) into chipatlas/sh/cellTypeDescription/cellTypeDescription.txt.
#
# Stage 1 (per table): emit  <source code> TAB <cell type name>  followed by
#   zero or more TAB-separated "Label=value" items. The table is recognised by
#   matching its path against a pattern; the source code is a fixed digit:
#   1 = Yu_etal, 2 = atccCollection, 3 = mesh, 4 = ENCODE, 5 = FlyBaseCollection.
# Stage 2: join the items with "|", or write "-" when a row has no items.
#
# The glob is expanded by the shell and every expansion is quoted, so a table
# whose path contains whitespace is processed as one file (iterating over the
# output of ls split such a path into several non-existent names).
for fn in chipatlas/sh/cellTypeDescription/*.tab; do
  # An unmatched glob is left as the literal pattern; skip it.
  [ -f "$fn" ] || continue
  # "=" and "|" are deleted from the data because the output format uses them
  # as the label/value and the item separators.
  tr -d '=|' < "$fn" | awk -F '\t' -v fn="$fn" '{
    if (fn ~ "atccCollection.tab") {
      printf "2\t%s", $2
      if ($6) printf "\tTissue=%s", $6
      if ($4) printf "\tCell Type=%s", $4
      if ($5) printf "\tDisease=%s", $5
      printf "\n"
    }
    if (fn ~ "ENCODE.tab") {
      printf "4\t%s", $1
      if ($6) printf "\tTissue=%s", $6
      if ($5) printf "\tLineage=%s", $5
      if ($4) printf "\tDescription=%s", $4
      printf "\n"
    }
    if (fn ~ "FlyBaseCollection.tab") {
      printf "5\t%s", $1
      if ($3 && $3 != "-") printf "\tSource=%s", $3
      if ($4 && $4 != "-") printf "\tTissue Source=%s", $4
      if ($5 && $5 != "-") printf "\tDevelopmental Stage=%s", $5
      printf "\n"
    }
    if (fn ~ "mesh") {
      printf "3\t%s", $2
      if ($3) printf "\tMeSH Description=%s", $3
      printf "\n"
    }
    if (fn ~ "Yu_etal.tab") {
      printf "1\t%s", $1
      if ($4) printf "\tPrimary Tissue=%s", $4
      if ($5) printf "\tSite of Extraction=%s", $5
      if ($6) printf "\tTissue Diagnosis=%s", $6
      printf "\n"
    }
  }'
done | awk -F '\t' '{
  # Exactly two columns means no description items were emitted.
  if (NF == 2) {
    print $1 "\t" $2 "\t-"
    next
  }
  # Join columns 3..NF with "|" (no leading separator to strip afterwards).
  desc = $3
  for (i = 4; i <= NF; i++) desc = desc "|" $i
  print $1 "\t" $2 "\t" desc
}' > chipatlas/sh/cellTypeDescription/cellTypeDescription.txt
      # Example output row:
      # 1       K-562   Primary Tissue=Blood|Tissue Diagnosis=Leukemia Chronic Myelogenous

# Gather the per-experiment summary files.
# Aggregates and temporary files left by an earlier run are deleted first.
rm -f \
  chipatlas/results/*/summary/allSummary.txt \
  chipatlas/results/*/bmap/summary/allSummary.txt \
  chipatlas/results/*/summary/tmp*txt \
  chipatlas/results/*/bmap/summary/tmp*txt

# Generate one "cat <file> >> <genome dir>/tmp.$JOB_ID.txt" command per summary
# file and hand the command list to splitQsub; then block until the "Summary"
# jobs are done. The genome is path component 3 of each file name.
# "$JOB_ID" is written literally into the command (the awk program is
# single-quoted), so it is presumably expanded by the job that runs the line.
# NOTE(review): echo + tr is kept rather than ls, presumably so the large glob
# expansion never becomes an exec argument list - confirm.
{
  echo chipatlas/results/*/summary/*RX*.txt \
    | tr ' ' '\n' \
    | awk -F/ '{ printf "cat %s >> chipatlas/results/%s/summary/tmp.$JOB_ID.txt\n", $0, $3 }'
  echo chipatlas/results/*/bmap/summary/*RX*.txt \
    | tr ' ' '\n' \
    | awk -F/ '{ printf "cat %s >> chipatlas/results/%s/bmap/summary/tmp.$JOB_ID.txt\n", $0, $3 }'
} | bin/splitQsub -p 300 -j Summary -t tmp/Summary
~/bin/wait4qsub Summary

# For every genome directory, merge the temporary summary chunks of both the
# regular and the bmap summary directory into a sorted allSummary.txt, keeping
# only rows whose first column contains "RX", then delete the chunks.
for genome in $(ls chipatlas/results); do
  for sumDir in "chipatlas/results/$genome/summary" "chipatlas/results/$genome/bmap/summary"; do
    cat "$sumDir"/tmp*txt | awk '$1 ~ /RX/' | sort > "$sumDir/allSummary.txt"
    rm "$sumDir"/tmp*txt
  done
done

# Build experimentList.tab
#
# For every genome directory under chipatlas/results, the regular and the bmap
# allSummary.txt are concatenated and each row is joined with lookup tables
# that the awk BEGIN block loads (all files are tab-separated):
#   fn1  abbreviationList_AG.tab      a[col1] = col2
#   fn2  abbreviationList_CT.tab      c[col1] = col2
#   fn3  cellTypeDescription.txt      d[col2 "?" col1] = col3
#   fn4  ct_Statistics-<genome>-tab.tsv
#        key = column 5 from its 6th character on; the part of the key before
#        "?" is mapped to d[key]  ->  D[name] = description
#   fn5  bmap/summary/allLineNum.txt  n[col1] = col2
#   fn6  summary/allLineNum.txt       n[col1] = col2
#        (fn5 is read before fn6, so an ID present in both keeps the fn6 value)
#   fn7  metadataForPeakBrowser.tsv, keyed by column 1:
#        agL = a[first 3 chars of col3], agS = col3 from its 6th character on
#        ctL = c[first 3 chars of col4], ctS = col4 from its 6th character on
#        Des = D[ctS], or "NA" when that is empty
#        A   = first 3 chars of col3
#        q   = col5 ("xxx" becomes "-", "_" becomes a space), followed by
#              columns 8..NF, tab-joined
# The order of these loads matters: fn3 must precede fn4, and fn1/fn2/fn4 must
# precede fn7.
#
# Output columns per summary row:
#   $1 ID, $2 genome, $3 agL, $4 agS, $5 ctL, $6 ctS, $7 Des,
#   $8 "%d,%.1f,%.1f,%d" built from summary columns 5, 6-or-7, 8 and n[ID]
#      (column 6 is used when A[ID] is "BSF", column 7 otherwise),
#   $9.. q[ID]
# Rows whose columns 3-6 are all empty (no metadata entry matched) are dropped
# by the trailing awk filter.
for genome in `ls chipatlas/results`; do
  sum1=chipatlas/results/$genome/summary/allSummary.txt
  sum2=chipatlas/results/$genome/bmap/summary/allSummary.txt
  cat $sum1 $sum2| awk -F '\t' -v genome=$genome -v OFS='\t' '
  BEGIN {
    fn1 = "chipatlas/sh/abbreviationList_AG.tab"
    fn2 = "chipatlas/sh/abbreviationList_CT.tab"
    fn3 = "chipatlas/sh/cellTypeDescription/cellTypeDescription.txt"
    fn4 = "chipatlas/classification/ct_Statistics-"genome"-tab.tsv"
    fn5 = "chipatlas/results/"genome"/bmap/summary/allLineNum.txt"
    fn6 = "chipatlas/results/"genome"/summary/allLineNum.txt"
    fn7 = "chipatlas/lib/metadata/metadataForPeakBrowser.tsv"
    while ((getline < fn1) > 0) a[$1] = $2
    while ((getline < fn2) > 0) c[$1] = $2
    while ((getline < fn3) > 0) d[$2"?"$1] = $3
    while ((getline < fn4) > 0) {
      split(substr($5, 6), s, "?")
      D[s[1]] = d[substr($5, 6)]
    }
    while ((getline < fn5) > 0) n[$1] = $2
    while ((getline < fn6) > 0) n[$1] = $2
    while ((getline < fn7) > 0) {
      agL[$1] = a[substr($3, 1, 3)]
      agS[$1] = substr($3, 6)
      ctL[$1] = c[substr($4, 1, 3)]
      ctS[$1] = substr($4, 6)
      Des[$1] = D[substr($4, 6)]
      if (length(Des[$1]) == 0) Des[$1] = "NA"
      A[$1] = substr($3, 1, 3)
      q[$1] = ($5 == "xxx") ? "-" : $5
      gsub("_", " ", q[$1])
      for (i=8; i<=NF; i++) q[$1] = q[$1] "\t" $i
    }
  } {
    if (A[$1] == "BSF") y = sprintf("%d,%.1f,%.1f,%d", $5, $6, $8, n[$1])
    else                y = sprintf("%d,%.1f,%.1f,%d", $5, $7, $8, n[$1])
    print $1, genome, agL[$1], agS[$1], ctL[$1], ctS[$1], Des[$1], y, q[$1]
  }'| awk -F '\t' 'length($3$4$5$6) > 0'
done > chipatlas/lib/assembled_list/experimentList.tab

# Summarise the number of peaks.
# One "wc -l <bed file>" command is generated per BED file (regular and bmap
# results) and submitted through splitQsub; the script then waits for the
# "PeakNum" jobs to finish.
echo chipatlas/results/*/Bed*/Bed/*RX*bed chipatlas/results/*/bmap/*/Bed/*RX*bed \
  | tr ' ' '\n' \
  | awk '{ printf "wc -l %s\n", $1 }' \
  | ~/bin/splitQsub -p600 -j PeakNum -t tmp/tmpDirPeakNum
~/bin/wait4qsub PeakNum

# Each log line is read as "<count> <path>". Splitting on space or "/" gives
# the count (field 1), the genome (field 4) and the file name (last field);
# those three are re-split on space or "." so that the first two dot-separated
# parts of the file name become separate columns.
# Output columns: name part 1, genome, name part 2, count.
cat tmp/tmpDirPeakNum/log/*log.txt \
  | awk -F '[ /]' '{
      rec = $1 " " $4 " " $NF
      split(rec, part, "[ .]")
      printf "%s\t%s\t%s\t%s\n", part[3], part[2], part[4], part[1]
    }' \
  | sort > chipatlas/lib/assembled_list/peakNumber.tsv
rm -rf tmp/tmpDirPeakNum

cat << '======================' > /dev/null

cat chipatlas/lib/assembled_list/experimentList.tab| grep SRX019491| coln
 1  SRX019491             SRX019491             # $1 = SRX                        
 2  hg19                  hg38                  # $2 = Genome                     
 3  Input control         Input control         # $3 = 抗原 大                     
 4  Input control         Input control         # $4 = 抗原 小                     
 5  Adipocyte             Adipocyte             # $5 = 細胞 大                     
 6  Adipose stromal cell  Adipose stromal cell  # $6 = 細胞 小                     
 7  NA                    NA                    # $7 = 細胞小の記述 (ない場合は NA)   
 8  70504198,54.2,5.1,13  70504198,55.6,3.6,17  # $8 = リード情報 # of reads, % mapped, % duplicates, # of peaks [Q < 1E-05]
 9  GSM534464: hASC WCE   GSM534464: hASC WCE   # $9 = フルタイトル (GSM 含む)       
10  source_name=hASC, Wh  source_name=hASC, Wh  # $10 以降 : メタデータ (タブ区切り)  
11  cell type=Adipose st  cell type=Adipose st  
12  chip epitope=None     chip epitope=None     
13  chip antibody=None    chip antibody=None    

cat chipatlas/lib/assembled_list/experimentList.tab| grep SRX155001| coln -c25
 1  SRX155001                  SRX155001                  # $1 = SRX                        
 2  hg19                       hg38                       # $2 = Genome                     
 3  Bisulfite-Seq              Bisulfite-Seq              # $3 = 抗原 大                     
 4  Bisulfite-Seq              Bisulfite-Seq              # $4 = 抗原 小                     
 5  Uterus                     Uterus                     # $5 = 細胞 大                     
 6  HeLa                       HeLa                       # $6 = 細胞 小                     
 7  Primary Tissue=Cervix|Tis  Primary Tissue=Cervix|Tis  # $7 = 細胞小の記述 (ない場合は NA)   
 8  132179670,88.9,3.7,304569  132179672,88.1,3.4,311292  # $8 = リード情報 # of reads, % mapped, x coverage, # of hypermr
 9  GSM949621: Input standard  GSM949621: Input standard  # $9 = フルタイトル (GSM 含む)       
10  source_name=HeLa-S3        source_name=HeLa-S3        # $10 以降 : メタデータ (タブ区切り)  
11  cell line=HeLa cervical c  cell line=HeLa cervical c  
12  chip antibody=none         chip antibody=none         
13  starting library amount=s  starting library amount=s  
======================

# Build the HTML used for web search
sh chipatlas/sh/refineSearchList.sh

# Build the JSON files with library information and analysis logs for each SRX
sh chipatlas/sh/expF_tab_to_json.sh

# Build fileList.tab
#
# All *.bedList.txt files are concatenated (echo + xargs cat) and filtered:
#   * a literal ".bed" suffix is removed from column 1. The dot is escaped so
#     that only a real ".bed" extension is stripped; an unescaped dot would
#     also cut names that merely end in "<any character>bed".
#   * for the antigen classes Bisulfite-Seq, DNase-seq and ATAC-Seq only rows
#     whose antigen (column 4) is "-" are kept; rows of every other class are
#     passed through unchanged.
echo tmpDirForPeakBrowser/bedList/*/*.bedList.txt | xargs cat | awk -F '\t' -v OFS='\t' '
  {
    sub(/\.bed$/, "", $1)
  }
  # Peak Browser: Bisulfite, DNase and ATAC entries carry no specific antigen.
  ($3 == "Bisulfite-Seq" || $3 == "DNase-seq" || $3 == "ATAC-Seq") && $4 != "-" {
    next
  }
  {
    print
  }
' > chipatlas/lib/assembled_list/fileList.tab
  # $1 = File name, ".bed" suffix removed     His.Lar.10.H3K4me3.AllCell
  # $2 = Genome                               ce10
  # $3 = Antigen class                        Histone
  # $4 = Antigen ("-" for AllAg)              H3K4me3
  # $5 = Cell type class                      Larvae
  # $6 = Cell type ("-" for AllCell)          -
  # $7 = q-Val                                10
  # $8 = SRX IDs (comma-separated)            SRX059255,SRX063957,SRX059274,SRX059273,SRX059254

# Build the antigen list and the cell-type list.
#
# experimentList.tab is sorted first, so the IDs collected for each key are in
# sorted order. For every (genome, class, name) combination the number of
# experiments and the comma-separated list of their IDs (column 1) are written:
#   antigenList.tab   key = columns 2, 3, 4
#   celltypeList.tab  key = columns 2, 5, 6
# Each file starts with a header line followed by the rows sorted by sort(1).
#
# The rows are piped straight into "sort >> file" instead of going through a
# two-way coprocess: that operator exists only in gawk, while a plain output
# pipe works in any POSIX awk. The header is written and the file closed
# before sort appends to it, so the header always comes first.
sort chipatlas/lib/assembled_list/experimentList.tab | awk -F '\t' '
BEGIN {
  agFile = "chipatlas/lib/assembled_list/antigenList.tab"
  ctFile = "chipatlas/lib/assembled_list/celltypeList.tab"
  print "Genome\tAntigen_class\tAntigen\tNum_data\tID" > agFile
  print "Genome\tCell_type_class\tCell_type\tNum_data\tID" > ctFile
  close(agFile)
  close(ctFile)
}
{
  agKey = $2 "\t" $3 "\t" $4
  ctKey = $2 "\t" $5 "\t" $6
  agIds[agKey] = (agKey in agNum) ? agIds[agKey] "," $1 : $1
  ctIds[ctKey] = (ctKey in ctNum) ? ctIds[ctKey] "," $1 : $1
  agNum[agKey]++
  ctNum[ctKey]++
}
END {
  agSort = "sort >> " agFile
  for (key in agNum) print key "\t" agNum[key] "\t" agIds[key] | agSort
  close(agSort)

  ctSort = "sort >> " ctFile
  for (key in ctNum) print key "\t" ctNum[key] "\t" ctIds[key] | ctSort
  close(ctSort)
}'

# coln chipatlas/lib/assembled_list/antigenList.tab
#  1  Genome                ce10                  
#  2  Antigen_class         ATAC-Seq              
#  3  Antigen               ATAC-Seq              
#  4  Num_data              90                    
#  5  ID                    SRX2332995,SRX233299  

# coln chipatlas/lib/assembled_list/celltypeList.tab 
#  1  Genome                ce10                  
#  2  Cell_type_class       Adult                 
#  3  Cell_type             Adult                 
#  4  Num_data              83                    
#  5  ID                    SRX076077,SRX076078,  
# End of the script. Without an argument, exit returns the status of the last
# command that was executed.
exit
