#!/bin/sh #$ -S /bin/sh # 全ての ChIP + Open chromatin の BED をソートする # qsub -e makePeakBrowser.log.txt -o makePeakBrowser.log.txt chipatlas/sh/makePeakBrowser/makePeakBrowser.sh 600 splitN=$1 : > makePeakBrowser.log.txt # Curation 済みの ag_Statistics-hg19-tab.tsv を ag_Statistics-hg38-tab.tsv にコピー for org in `cat chipatlas/sh/preferences.txt| awk '$1 == "Genome"'| cut -f2| tr ' ' '\n'| cut -d = -f1| grep ,`; do ls chipatlas/classification/*Statistics-*-tab.tsv| awk ' BEGIN { split("'$org'", a, ",") } $1 ~ a[1] { printf "cat " $1 sub(a[1], a[2], $1) print " > " $1 }' done| sh # ag_Statistics-hg19-tab.tsv を ag_Index.hg19.tab に変形 for TSV in `ls chipatlas/classification/*Statistics-*-tab.tsv`; do outIndex=`echo $TSV| sed 's/Statistics-/Index./'| sed 's/-tab.tsv/.tab/'` tail -n +2 $TSV| awk -F '\t' '$4 != $5 { gsub(/?[0-9]/, "", $5) print $4 "\t" $5 }' > $outIndex done # input|NA InP@ Input # H3K9me3|AB8898 His@ H3K9me3 # anti-H3 (Abcam 1791) His@ H3 # PeakBrowser 用の metadata を作成 GENOME=`ls chipatlas/results` ls chipatlas/lib/metadata/NCBI_SRA_Metadata_Full_20*| tail -n1| xargs cat| awk -F '\t' -v GENOME="$GENOME" ' BEGIN { for (i=1; i<=split(GENOME, g, " "); i++) { genome = g[i] fn1 = "chipatlas/classification/ag_Index."genome".tab" fn2 = "chipatlas/classification/ct_Index."genome".tab" fn3 = "chipatlas/results/"genome"/tag/allTags.txt" while ((getline < fn1) > 0) a[genome, $1] = $2 while ((getline < fn2) > 0) l[genome, $1] = $2 while ((getline < fn3) > 0) { A[$1] = a[genome, $5] C[$1] = l[genome, $7] G[$1] = genome } } } A[$1] { printf "%s", $1 "\t" G[$1] "\t" A[$1] "\t" C[$1] "\t" $3 "\t" $4 for (i=18; i<=NF; i++) printf "\t%s", $i printf "\n" }'| awk -F '\t' ' function symbolSub(Str,underScore) { gsub("%", "%25", Str) # IGV で表示できない文字 %+;=" space を URL エンコーディングに変換 gsub("+", "%2B", Str) gsub(";", "%3B", Str) gsub("=", "%3D", Str) gsub("\"", "%22", Str) gsub(" ", "%20", Str) if (underScore == "_") gsub(underScore, "%20", Str) return Str } BEGIN { while ((getline < "chipatlas/sh/abbreviationList_AG.tab") > 0) AGL[$1] = $2 while ((getline < "chipatlas/sh/abbreviationList_CT.tab") > 0) CTL[$1] = $2 } { SRX = $1 Title[SRX] = symbolSub($5,"_") sub("xxx", "NA", Title[SRX]) ag = substr($3, 6) ct = substr($4, 6) if (substr($4, 1, 3) == "Unc") cellGroup = "NA" # Unc の場合 else cellGroup = symbolSub(CTL[substr($4, 1, 3)],"_") fi for (i=7; i<=NF; i++) { str = $i sub("=", SUBSEP, str) str = symbolSub(str) split(str, arr, SUBSEP) Atrb[SRX] = Atrb[SRX] arr[1] "=" arr[2] ";" } printf "%s", $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5 "\t" $6 "\t" printf "ID=%s;", SRX printf "Name=%s%20", symbolSub(ag,"_") printf "(@%20" printf "%s", symbolSub(ct,"_") printf ");" printf "Title=%s;", Title[SRX] printf "%s%s;", "Cell%20group=", cellGroup printf "
%s", Atrb[SRX] for (i=7; i<=NF; i++) printf "\t%s", $i printf "\n" }' > chipatlas/lib/metadata/metadataForPeakBrowser.tsv # 1 DRX000201 DRX000203 # SRX # 2 mm9 mm9 # Genome (mm9 と mm10 の重複なし) # 3 Unc@ Unclassified Unc@ Unclassified # 抗原 # 4 Myo@ C2C12 Myo@ C2C12 # 細胞 # 5 C2C12_Growth_H3.3 C2C12_Chd2mir3139_Growth_H3.3 # タイトル (GSM 含む) # 6 ChIP-Seq ChIP-Seq # Seq タイプ # 7 ID=DRX000201;Name=Unclassified ID=DRX000203;Name=Unclassified # Peak Browser のメタデータ # 8 sample_name=DRS000203 sample_name=DRS000204 # 以降、オリジナルのメタデータ # 9 strain=C2C12 strain=C2C12 # 10 sample comment=source: C2C12 c sample comment=source:C2C12 ce # 11 cell type=myoblast cells cell type=myoblast cells ls tmpDirForPeakBrowser && mv tmpDirForPeakBrowser tmpDirForPeakBrowser_old && rm -rf tmpDirForPeakBrowser_old & mkdir -p tmpDirForPeakBrowser/splitByChr mkdir -p tmpDirForPeakBrowser/sortByChr mkdir -p tmpDirForPeakBrowser/sortedBed mkdir -p tmpDirForPeakBrowser/splitByAttributes # 全ての BED ファイルを chr ごとに分割する for genome in `ls chipatlas/results`; do for qval in 05 10 20; do mkdir -p tmpDirForPeakBrowser/splitByChr/$genome/$qval echo chipatlas/results/$genome/Bed$qval/Bed/*bed done for mr in hmr hypermr pmd; do mkdir -p tmpDirForPeakBrowser/splitByChr/$genome/bmap echo chipatlas/results/$genome/bmap/$mr/Bed/*.bed done done| tr ' ' '\n'| awk -F '[/.]' '{ if ($4 == "bmap") print "sh chipatlas/sh/makePeakBrowser/splitByChr.sh", $0, $3, $5, $7, "bmap" else print "sh chipatlas/sh/makePeakBrowser/splitByChr.sh", $0, $3, $7, $6, "chip" }'| bin/splitQsub -p $splitN -j splitByChr -t tmp/splitByChr ~/bin/wait4qsub splitByChr # 31 min / 600 threads, 130 min / 100 threads # 全ての BED ファイルを ID ごとにソートする for genome in `ls chipatlas/results`; do for type in 05 10 20 bmap; do echo tmpDirForPeakBrowser/splitByChr/$genome/$type/*/*| tr ' ' '\n' done done| tr ' ' '\n'| awk '{ print "sh chipatlas/sh/makePeakBrowser/sortById.sh", $1 }'| bin/splitQsub -p $splitN -j sortById -t tmp/sortById ~/bin/wait4qsub sortById # 14 min / 600 threads, 9 min / 100 threads # 全ての BED ファイルを chr ごとにソートする for genome in `ls chipatlas/results`; do for type in 05 10 20 bmap; do for chr in `cut -f1 chipatlas/lib/genome_size/$genome.chrom.sizes`; do echo "sh chipatlas/sh/makePeakBrowser/sortByChr.sh $genome $type $chr" done done done| bin/splitQsub -p $splitN -j sortByChr -t tmp/sortByChr ~/bin/wait4qsub sortByChr # 5 min / 600 threads, 8 min / 100 threads # 全ての chr ファイルを連結する for genome in `ls chipatlas/results`; do for type in 05 10 20 bmap; do qsub -e /dev/null -o /dev/null -N sortAll chipatlas/sh/makePeakBrowser/sortAll.sh $genome $type done done # temp ファイルの削除 for genome in `ls chipatlas/results`; do for type in 05 10 20 bmap; do echo tmpDirForPeakBrowser/splitByChr/$genome/$type/* done done| tr ' ' '\n'| awk '{ print "rm -rf " $0 }'| bin/splitQsub -p $splitN -j delTemp -t tmp/delTemp ~/bin/wait4qsub sortAll # 5 min / 600 threads, 5 min / 100 threads ~/bin/wait4qsub delTemp # 1 min / 600 threads, 1 min / 100 threads # 不要なファイルの削除 rm -rf tmpDirForPeakBrowser/sortByChr rm -rf tmpDirForPeakBrowser/splitByChr rm -rf tmp/splitByChr rm -rf tmp/sortById rm -rf tmp/sortByChr rm -rf tmp/delTemp # bmap を bs というファイル名に変更 echo tmpDirForPeakBrowser/sortedBed/*/*bmap.bed| tr ' ' '\n'| awk '{ printf "mv " $1 sub("bmap", "bs", $1) print " " $1 }'| sh # allPeaks_light の作成、insilicoChIP のためのファイルの作成、summary/allLineNum.txt の作成 rm -rf tmpDirForPeakBrowser/allPeaks_light rm -rf tmpDirForPeakBrowser/inSilicoChIP rm -rf tmpDirForPeakBrowser/linNum mkdir -p tmpDirForPeakBrowser/linNum for genome in `ls chipatlas/results`; do mkdir -p tmpDirForPeakBrowser/allPeaks_light/$genome mkdir -p tmpDirForPeakBrowser/inSilicoChIP/$genome for qval in 05 10 20 50 bs; do qsub -e /dev/null -o /dev/null chipatlas/sh/makePeakBrowser/allPeaks_light.sh $genome $qval done done # 抗原大・細胞大で split rm -rf tmpDirForPeakBrowser/splitByAttributes rm -rf tmpDirForPeakBrowser/public rm -rf tmpDirForPeakBrowser/bedList for genome in `ls chipatlas/results`; do mkdir -p tmpDirForPeakBrowser/splitByAttributes/$genome mkdir -p tmpDirForPeakBrowser/public/$genome mkdir -p tmpDirForPeakBrowser/bedList/$genome done GENOME=`ls -1 chipatlas/results| tr '\n' ' '` cat chipatlas/lib/metadata/metadataForPeakBrowser.tsv| awk -F '\t' -v GENOME="$GENOME" '{ gsub(/[0-9]/, "", $2) g[$2]++ A[substr($3,1,3)]++ C[substr($4,1,3)]++ C["xxx"]++ a[$2, substr($3,1,3)]++ c[$2, substr($4,1,3)]++ c[$2, "xxx"]++ } END { for (i=1; i<=split(GENOME, x, " "); i++) G[x[i]]++ for (i=1; i<=split("05 10 20 50", y, " "); i++) Q[y[i]]++ for (genome in G) { gen = genome gsub(/[0-9]/, "", gen) for (qval in Q) for (ag in A) for (ct in C) { if (a[gen, ag] > 0 && c[gen, ct] > 0 && ag != "BSF") { print "sh chipatlas/sh/makePeakBrowser/splitByAttributes.sh tmpDirForPeakBrowser/sortedBed/"genome"/"genome"."qval".bed "genome" "ag" "ct" "qval" xxx xxx" } } for (ct in C) { if (c[gen, ct] > 0) { print "sh chipatlas/sh/makePeakBrowser/splitByAttributes.sh tmpDirForPeakBrowser/sortedBed/"genome"/"genome".bs.bed "genome" BSF "ct" bs xxx xxx" } } } }'| bin/splitQsub -p $splitN -j splitLxL -t tmp/splitLxL -k "-l s_vmem=16G,mem_req=16G" ~/bin/wait4qsub splitLxL # 3.3 hrs / 600 threads # 抗原小・細胞小で split ls -l tmpDirForPeakBrowser/splitByAttributes/*/*bed| awk 'NF > 4 && $5 > 0 { print $NF }'| tr '/.' '\t\t'| awk -F '\t' -v OFS='\t' '$(NF-4) != "ALL" { print $(NF-6), $(NF-5), $(NF-4), $(NF-3) }'| awk -F '\t' -v OFS='\t' ' BEGIN { while ((getline < "chipatlas/lib/metadata/metadataForPeakBrowser.tsv") > 0) { gen = $2 gsub(/[0-9]/, "", gen) agL = substr($3,1,3) ctL = substr($4,1,3) agS = substr($3,6) ctS = substr($4,6) if (!ww[gen, agL, ctL, agS]++) a[gen, agL, ctL] = a[gen, agL, ctL] SUBSEP agS if (!xx[gen, agL, ctL, ctS]++) c[gen, agL, ctL] = c[gen, agL, ctL] SUBSEP ctS } for (j=1; j<=split("hypermr hmr pmd", b, " "); j++) B[b[j]]++ } { genome = $1 gen = $1 gsub(/[0-9]/, "", gen) agL = $2 ctL = $3 qvl = $4 N = split(a[gen, agL, ctL], A, SUBSEP) if (agL == "BSF") { for (mr in B) print genome, qvl, agL, "xxx", mr, "xxx" for (mr in B) print genome, qvl, agL, ctL, mr, "xxx" } else { for (i=2; i<=N; i++) print genome, qvl, agL, "xxx", A[i], "xxx" for (i=2; i<=N; i++) print genome, qvl, agL, ctL, A[i], "xxx" } delete A N = split(c[gen, agL, ctL], C, SUBSEP) for (i=2; i<=N; i++) print genome, qvl, agL, ctL, "xxx", C[i] delete C }'| awk '!a[$0]++'| awk -F '\t' -v OFS='\t' '{ Lc = ($4 == "xxx") ? "ALL" : $4 print "sh chipatlas/sh/makePeakBrowser/splitByAttributes.sh tmpDirForPeakBrowser/splitByAttributes/"$1"/"$3"."Lc"."$2".AllAg.AllCell.bed "$1" "$3" "$4" "$2" \""$5"\" \""$6"\"" }'| bin/splitQsub -p $splitN -j splitSxS -t tmp/splitSxS -k "-l s_vmem=16G,mem_req=16G" ~/bin/wait4qsub splitSxS # 3.0 hrs / 600 threads ~/bin/wait4qsub makeBed9GF # 19 hrs / 600 threads # experimentList.tab, fileList.tab などの作成 qsub -e assembleList.log.txt -o assembleList.log.txt -N makeList chipatlas/sh/assembleList.sh ~/bin/wait4qsub makeList # 19 hrs / 600 threads # lineNum.tsv の作成 echo tmpDirForPeakBrowser/linNum/*| xargs awk -F '/.' '{ print FILENAME "\t" $1 }'| tr '/.' '\t\t'| awk -F '\t' -v OFS='\t' ' BEGIN { while ((getline < "chipatlas/sh/abbreviationList_AG.tab") > 0) a[$1] = $2 while ((getline < "chipatlas/sh/abbreviationList_CT.tab") > 0) c[$1] = $2 } { print $3, a[$4], c[$5], $6, $10 }' > chipatlas/lib/inSilicoChIP/lineNum.tsv # allPeaks_light の更新 for genome in `ls chipatlas/results`; do mv -f tmpDirForPeakBrowser/allPeaks_light/$genome/* chipatlas/results/$genome/allPeaks_light/ done # public を削除 for genome in `ls chipatlas/results`; do echo chipatlas/results/$genome/public/* chipatlas/lib/inSilicoChIP/results/$genome/public/* done| tr ' ' '\n'| awk '{ print "rm -f "$1 }'| bin/splitQsub -p $splitN -j rmPublic ~/bin/wait4qsub rmPublic # 3 min / 300 threads # public, in silico ChIP データを移動 for genome in `ls chipatlas/results`; do rm -rf chipatlas/results/$genome/public rm -rf chipatlas/lib/inSilicoChIP/results/$genome/public mv tmpDirForPeakBrowser/public/$genome chipatlas/results/$genome/public mv tmpDirForPeakBrowser/inSilicoChIP/$genome chipatlas/lib/inSilicoChIP/results/$genome/public done exit