# Time-stamp: <2014-06-17 01:39:56 Tao Liu> """Module Description Copyright (c) 2008,2009 Yong Zhang, Tao Liu Copyright (c) 2010,2011 Tao Liu This code is free software; you can redistribute it and/or modify it under the terms of the BSD License (see the file COPYING included with the distribution). @status: experimental @version: $Revision$ @author: Yong Zhang, Tao Liu @contact: taoliu@jimmy.harvard.edu """ #import os #from array import array #from copy import deepcopy from itertools import groupby from operator import itemgetter import io #import subprocess import gc # use garbage collectior from MACS2.IO.cPeakIO import PeakIO from MACS2.IO.cBedGraphIO import bedGraphIO from MACS2.Constants import * from MACS2.IO.cCallPeakUnit import CallerFromAlignments cdef str subpeak_letters(short i): if i < 26: return chr(97+i) else: return subpeak_letters(i / 26) + chr(97 + (i % 26)) class PeakDetect: """Class to do the peak calling. e.g >>> from MACS2.cPeakDetect import cPeakDetect >>> pd = PeakDetect(treat=treatdata, control=controldata, pvalue=pvalue_cutoff, d=100, gsize=3000000000) >>> pd.call_peaks() """ def __init__ (self,opt = None,treat = None, control = None, d = None, slocal = None, llocal = None): """Initialize the PeakDetect object. """ self.opt = opt self.info = opt.info self.debug = opt.debug self.warn = opt.warn self.treat = treat self.control = control self.ratio_treat2control = None self.peaks = None self.final_peaks = None self.PE_MODE = opt.PE_MODE self.scoretrack = None #self.femax = opt.femax #self.femin = opt.femin #self.festep = opt.festep self.log_pvalue = opt.log_pvalue # -log10pvalue self.log_qvalue = opt.log_qvalue # -log10qvalue if d != None: self.d = d else: self.d = self.opt.d self.end_shift = self.opt.shift self.gsize = opt.gsize self.nolambda = opt.nolambda if slocal != None: self.sregion = slocal else: self.sregion = opt.smalllocal if llocal != None: self.lregion = llocal else: self.lregion = opt.largelocal if (self.nolambda): self.info("#3 !!!! DYNAMIC LAMBDA IS DISABLED !!!!") #self.diag = opt.diag #self.save_score = opt.store_score #self.zwig_tr = opt.zwig_tr #self.zwig_ctl= opt.zwig_ctl def call_peaks (self): """Call peaks function. Scan the whole genome for peaks. RESULTS WILL BE SAVED IN self.final_peaks and self.final_negative_peaks. """ if self.control: # w/ control #if self.opt.broad: # (self.peaks,self.broadpeaks) = self.__call_peaks_w_control() #else: self.peaks = self.__call_peaks_w_control () else: # w/o control #if self.opt.broad: # (self.peaks,self.broadpeaks) = self.__call_peaks_wo_control() #else: self.peaks = self.__call_peaks_wo_control () return self.peaks def __call_peaks_w_control (self): """To call peaks with control data. A peak info type is a: dictionary key value: chromosome items: (peak start,peak end, peak length, peak summit, peak height, number of tags in peak region, peak pvalue, peak fold_enrichment) <-- tuple type While calculating pvalue: First, t and c will be adjusted by the ratio between total reads in treatment and total reads in control, depending on --to-small option. Then, t and c will be multiplied by the smallest peak size -- self.d. Finally, a poisson CDF is applied to calculate one-side pvalue for enrichment. """ cdef: int i float lambda_bg, effective_depth_in_million float treat_scale, d list ctrl_scale_s, ctrl_d_s long treat_total, control_total long treat_sum # approx sum of treatment pileup values long control_sum # approx sum of control pileup values treat_total = self.treat.total if self.PE_MODE: d = self.treat.average_template_length control_total = self.control.total * 2 # in PE mode, entire fragment is counted as 1 # in treatment whereas both ends of fragment are counted in control/input. treat_sum = self.treat.length control_sum = control_total * self.treat.average_template_length self.ratio_treat2control = float(treat_sum)/control_sum else: d = self.d control_total = self.control.total treat_sum = self.treat.total * self.d control_sum = self.control.total * self.d self.ratio_treat2control = float(treat_sum)/control_sum if self.opt.ratio != 1.0: self.ratio_treat2control = self.opt.ratio if self.opt.tocontrol: # if MACS decides to scale treatment to control data because treatment is bigger effective_depth_in_million = control_total / 1000000.0 lambda_bg = float( control_sum )/ self.gsize treat_scale = 1/self.ratio_treat2control else: # if MACS decides to scale control to treatment because control sample is bigger effective_depth_in_million = treat_total / 1000000.0 lambda_bg = float( treat_sum )/ self.gsize treat_scale = 1.0 # prepare d_s for control data if self.sregion: assert self.d <= self.sregion, "slocal can't be smaller than d!" if self.lregion: assert self.d <= self.lregion , "llocal can't be smaller than d!" assert self.sregion <= self.lregion , "llocal can't be smaller than slocal!" # Now prepare a list of extension sizes ctrl_d_s = [ self.d ] # note, d doesn't make sense in PE mode. # And a list of scaling factors for control ctrl_scale_s = [] # d if not self.opt.tocontrol: # if user wants to scale everything to ChIP data tmp_v = self.ratio_treat2control else: tmp_v = 1.0 ctrl_scale_s.append( tmp_v ) # slocal size local if self.sregion: ctrl_d_s.append( self.sregion ) if not self.opt.tocontrol: # if user want to scale everything to ChIP data tmp_v = float(self.d)/self.sregion*self.ratio_treat2control else: tmp_v = float(self.d)/self.sregion ctrl_scale_s.append( tmp_v ) # llocal size local if self.lregion and self.lregion > self.sregion: ctrl_d_s.append( self.lregion ) if not self.opt.tocontrol: # if user want to scale everything to ChIP data tmp_v = float(self.d)/self.lregion*self.ratio_treat2control else: tmp_v = float(self.d)/self.lregion ctrl_scale_s.append( tmp_v ) #if self.PE_MODE: # first d/scale are useless in PE mode # ctrl_d_s = ctrl_d_s[1:] # ctrl_scale_s = ctrl_scale_s[1:] # print ctrl_d_s # print ctrl_scale_s if self.nolambda: ctrl_d_s = [] ctrl_scale_s = [] scorecalculator = CallerFromAlignments( self.treat, self.control, d = d, ctrl_d_s = ctrl_d_s, treat_scaling_factor = treat_scale, ctrl_scaling_factor_s = ctrl_scale_s, end_shift = self.end_shift, lambda_bg = lambda_bg, save_bedGraph = self.opt.store_bdg, bedGraph_filename_prefix = self.opt.name, bedGraph_treat_filename = self.opt.bdg_treat, bedGraph_control_filename = self.opt.bdg_control, save_SPMR = self.opt.do_SPMR) if self.opt.trackline: scorecalculator.enable_trackline() # call peaks call_summits = self.opt.call_summits if call_summits: self.info("#3 Going to call summits inside each peak ...") if self.log_pvalue: if self.opt.broad: self.info("#3 Call broad peaks with given level1 -log10pvalue cutoff and level2: %.5f, %.5f..." % (self.log_pvalue,self.opt.log_broadcutoff) ) peaks = scorecalculator.call_broadpeaks(['p',], lvl1_cutoff_s=[self.log_pvalue,],lvl2_cutoff_s=[self.opt.log_broadcutoff,],min_length=self.d, lvl1_max_gap=self.opt.tsize,lvl2_max_gap=self.d*4) else: self.info("#3 Call peaks with given -log10pvalue cutoff: %.5f ..." % self.log_pvalue) peaks = scorecalculator.call_peaks( ['p',], [self.log_pvalue,], min_length=self.d, max_gap=self.opt.tsize, call_summits=call_summits) elif self.log_qvalue: if self.opt.broad: self.info("#3 Call broad peaks with given level1 -log10qvalue cutoff and level2: %f, %f..." % (self.log_qvalue,self.opt.log_broadcutoff) ) peaks = scorecalculator.call_broadpeaks(['q',], lvl1_cutoff_s=[self.log_qvalue,],lvl2_cutoff_s=[self.opt.log_broadcutoff,],min_length=self.d, lvl1_max_gap=self.opt.tsize,lvl2_max_gap=self.d*4) else: peaks = scorecalculator.call_peaks( ['q',], [self.log_qvalue,], min_length=self.d, max_gap=self.opt.tsize, call_summits=call_summits) return peaks def __call_peaks_wo_control (self): """To call peaks without control data. A peak info type is a: dictionary key value: chromosome items: (peak start,peak end, peak length, peak summit, peak height, number of tags in peak region, peak pvalue, peak fold_enrichment) <-- tuple type While calculating pvalue: First, t and c will be adjusted by the ratio between total reads in treatment and total reads in control, depending on --to-small option. Then, t and c will be multiplied by the smallest peak size -- self.d. Finally, a poisson CDF is applied to calculate one-side pvalue for enrichment. """ cdef float lambda_bg, effective_depth_in_million cdef float treat_scale = 1 cdef float d cdef list ctrl_scale_s, ctrl_d_s if self.PE_MODE: d = 0 else: d = self.d treat_length = self.treat.length treat_total = self.treat.total effective_depth_in_million = treat_total / 1000000.0 # global lambda if self.PE_MODE: # # this an estimator, we should maybe test it for accuracy? lambda_bg = treat_length / self.gsize else: lambda_bg = float(d) * treat_total / self.gsize treat_scale = 1.0 # slocal and d-size local bias are not calculated! # nothing done here. should this match w control?? if not self.nolambda: if self.PE_MODE: ctrl_scale_s = [ float(treat_length) / (self.lregion*treat_total*2), ] else: ctrl_scale_s = [ float(self.d) / self.lregion, ] ctrl_d_s = [ self.lregion, ] else: ctrl_scale_s = [] ctrl_d_s = [] scorecalculator = CallerFromAlignments( self.treat, None, d = d, ctrl_d_s = ctrl_d_s, treat_scaling_factor = treat_scale, ctrl_scaling_factor_s = ctrl_scale_s, end_shift = self.end_shift, lambda_bg = lambda_bg, save_bedGraph = self.opt.store_bdg, bedGraph_filename_prefix = self.opt.name, bedGraph_treat_filename = self.opt.bdg_treat, bedGraph_control_filename = self.opt.bdg_control, save_SPMR = self.opt.do_SPMR ) # calculate pvalue scores if self.opt.trackline: self.scoretrack.enable_trackline() # call peaks call_summits = self.opt.call_summits if call_summits: self.info("#3 Going to call summits inside each peak ...") if self.log_pvalue: if self.opt.broad: self.info("#3 Call broad peaks with given level1 -log10pvalue cutoff and level2: %.5f, %.5f..." % (self.log_pvalue,self.opt.log_broadcutoff) ) peaks = scorecalculator.call_broadpeaks(['p',], lvl1_cutoff_s=[self.log_pvalue,],lvl2_cutoff_s=[self.opt.log_broadcutoff,],min_length=self.d, lvl1_max_gap=self.opt.tsize,lvl2_max_gap=self.d*4) else: self.info("#3 Call peaks with given -log10pvalue cutoff: %.5f ..." % self.log_pvalue) peaks = scorecalculator.call_peaks( ['p',], [self.log_pvalue,], min_length=self.d, max_gap=self.opt.tsize, call_summits=call_summits) elif self.log_qvalue: if self.opt.broad: self.info("#3 Call broad peaks with given level1 -log10qvalue cutoff and level2: %f, %f..." % (self.log_qvalue,self.opt.log_broadcutoff) ) peaks = scorecalculator.call_broadpeaks(['q',], lvl1_cutoff_s=[self.log_qvalue,],lvl2_cutoff_s=[self.opt.log_broadcutoff,],min_length=self.d, lvl1_max_gap=self.opt.tsize,lvl2_max_gap=self.d*4) else: peaks = scorecalculator.call_peaks( ['q',], [self.log_qvalue,], min_length=self.d, max_gap=self.opt.tsize, call_summits=call_summits) return peaks # def __diag_w_control (self): # # sample # sample_peaks = {} # for i in xrange(90,10,-10): # self.info("#3 diag: sample %d%%" % i) # sample_peaks[i]=self.__diag_peakfinding_w_control_sample(float(i)/(i+10)) # return self.__overlap (self.final_peaks, sample_peaks,top=90,bottom=10,step=-10) # def __diag_peakfinding_w_control_sample (self, percent): # self.treat.sample(percent) # because sampling is after # # shifting, track.total is used # # now. # self.control.sample(percent) # ratio_treat2control = float(self.treat.total)/self.control.total # #self.lambda_bg = float(self.scan_window)*self.treat.total/self.gsize # bug fixed... # #self.min_tags = poisson_cdf_inv(1-pow(10,self.pvalue/-10),self.lambda_bg)+1 # self.debug("#3 diag: after shift and merging, treat: %d, control: %d" % (self.treat.total,self.control.total)) # self.info("#3 diag: call peak candidates") # peak_candidates = self.__call_peaks_from_trackI (self.treat) # self.info("#3 diag: call negative peak candidates") # negative_peak_candidates = self.__call_peaks_from_trackI (self.control) # self.info("#3 diag: use control data to filter peak candidates...") # final_peaks_percent = self.__filter_w_control(peak_candidates,self.treat,self.control, ratio_treat2control) # return final_peaks_percent # def __diag_wo_control (self): # # sample # sample_peaks = {} # for i in xrange(90,10,-10): # self.info("#3 diag: sample %d%%" % i) # sample_peaks[i]=self.__diag_peakfinding_wo_control_sample(float(i)/(i+10)) # return self.__overlap (self.final_peaks, sample_peaks,top=90,bottom=10,step=-10) # def __diag_peakfinding_wo_control_sample (self, percent): # #self.lambda_bg = float(self.scan_window)*self.treat.total/self.gsize # bug fixed... # #self.min_tags = poisson_cdf_inv(1-pow(10,self.pvalue/-10),self.lambda_bg)+1 # self.treat.sample(percent) # self.debug("#3 diag: after shift and merging, tags: %d" % (self.treat.total)) # self.info("#3 diag: call peak candidates") # peak_candidates = self.__call_peaks_from_trackI (self.treat) # self.info("#3 diag: use self to calculate local lambda and filter peak candidates...") # final_peaks_percent = self.__filter_w_control(peak_candidates,self.treat,self.treat,pass_sregion=True) # bug fixed... # return final_peaks_percent # def __overlap (self, gold_peaks, sample_peaks, top=90,bottom=10,step=-10): # """Calculate the overlap between several fe range for the # golden peaks set and results from sampled data. # """ # gp = PeakIO() # gp.init_from_dict(gold_peaks) # if self.femax: # femax = min(self.femax, (int(gp.max_fold_enrichment())//self.festep+1)*self.festep) # else: # femax = (int(gp.max_fold_enrichment())//self.festep+1)*self.festep # femin = self.femin # diag_result = [] # for f in xrange(femin, femax, self.festep): # fe_low = f # fe_up = f + self.festep # self.debug("#3 diag: fe range = %d -- %d" % (fe_low, fe_up)) # r = self.__overlap_fe(gold_peaks, sample_peaks, fe_low, fe_up, top, bottom, step) # if r: # diag_result.append(r) # return diag_result # def __overlap_fe (self, gold_peaks, sample_peaks, fe_low, fe_up, top, bottom, step): # ret = ["%d-%d" % (fe_low,fe_up)] # gp = PeakIO() # gp.init_from_dict(gold_peaks) # gp.filter_fc(fe_low,fe_up) # gptotal = gp.total() # if gptotal <= 0: # return None # ret.append(gptotal) # for i in xrange(top,bottom,step): # p = PeakIO() # p.init_from_dict(sample_peaks[i]) # percent = 100.0*gp.overlap_with_other_peaks(p)/gptotal # ret.append(percent) # del p # return ret # def __remove_overlapping_peaks (self, peaks ): # """peak_candidates[chrom] = [(peak_start,peak_end,peak_length,peak_summit,peak_height,number_cpr_tags)...] # """ # new_peaks = {} # chrs = peaks.keys() # chrs.sort() # for chrom in chrs: # new_peaks[chrom]=[] # n_append = new_peaks[chrom].append # prev_peak = None # peaks_chr = peaks[chrom] # for i in xrange(len(peaks_chr)): # if not prev_peak: # prev_peak = peaks_chr[i] # continue # else: # if peaks_chr[i][0] <= prev_peak[1]: # s_new_peak = prev_peak[0] # e_new_peak = peaks_chr[i][1] # l_new_peak = e_new_peak-s_new_peak # if peaks_chr[i][4] > prev_peak[4]: # summit_new_peak = peaks_chr[i][3] # h_new_peak = peaks_chr[i][4] # else: # summit_new_peak = prev_peak[3] # h_new_peak = prev_peak[4] # prev_peak = (s_new_peak,e_new_peak,l_new_peak,summit_new_peak,h_new_peak,peaks_chr[i][5]+prev_peak[5]) # else: # n_append(prev_peak) # prev_peak = peaks_chr[i] # if prev_peak: # n_append(prev_peak) # return new_peaks