# Time-stamp: <2014-06-17 00:57:31 Tao Liu> """Module for Calculate Scores. Copyright (c) 2013 Tao Liu This code is free software; you can redistribute it and/or modify it under the terms of the BSD License (see the file COPYING included with the distribution). @status: experimental @version: $Revision$ @author: Tao Liu @contact: vladimir.liu@gmail.com """ # ------------------------------------ # python modules # ------------------------------------ import numpy as np cimport numpy as np from array import array as pyarray from collections import Counter from copy import copy, deepcopy from operator import itemgetter from cpython cimport bool #from scipy.signal import fftconvolve from MACS2.cSignal import maxima, enforce_valleys, enforce_peakyness #np_convolve = np.convolve # Experimental #from scipy.stats import chi2 from libc.stdint cimport uint32_t, uint64_t, int32_t, int64_t #from cython.parallel import parallel, prange cimport cython from libc.math cimport log10,log, floor, ceil from MACS2.Constants import BYTE4, FBYTE4, array from MACS2.cProb cimport poisson_cdf from MACS2.IO.cPeakIO import PeakIO, BroadPeakIO, parse_peakname from MACS2.IO.cFixWidthTrack import FWTrackIII from MACS2.IO.cPairedEndTrack import PETrackI from MACS2.Poisson import P_Score_Upper_Tail, LogLR_Asym # pure C code for calculating p-value scores/logLR of Poisson pscore_table = P_Score_Upper_Tail() # this table will cache pscore being calculated. get_pscore = pscore_table.get_pscore logLR_table = LogLR_Asym() # this table will cache pscore being calculated. get_logLR_asym = logLR_table.get_logLR_asym from MACS2.hashtable import Float64HashTable import logging from time import time as ttime # ------------------------------------ # constants # ------------------------------------ __version__ = "scoreCalculate $Revision$" __author__ = "Tao Liu " __doc__ = "scoreTrackI classes" # ------------------------------------ # Misc functions # ------------------------------------ cdef inline int int_max(int a, int b): return a if a >= b else b cdef inline int int_min(int a, int b): return a if a <= b else b def do_nothing(*args, **kwargs): pass LOG10_E = 0.43429448190325176 cdef inline np.ndarray apply_multiple_cutoffs ( list multiple_score_arrays, list multiple_cutoffs ): cdef: int i np.ndarray ret ret = multiple_score_arrays[0] > multiple_cutoffs[0] for i in range(1,len(multiple_score_arrays)): ret += multiple_score_arrays[i] > multiple_cutoffs[i] return ret cdef inline list get_from_multiple_scores ( list multiple_score_arrays, int index ): cdef: list ret = [] int i for i in range(len(multiple_score_arrays)): ret.append(multiple_score_arrays[i][index]) return ret cdef inline double get_logFE ( float x, float y ): """ return 100* log10 fold enrichment with +1 pseudocount. """ return log10( x/y ) cdef inline float get_subtraction ( float x, float y): """ return subtraction. """ return x - y cdef inline list getitem_then_subtract ( list peakset, int start ): cdef: list a a = map(itemgetter("start"), peakset) for i in range(len(a)): a[i] = str(a[i] - start) return a cdef inline int32_t left_sum ( data, int pos, int width ): """ """ return sum([data[x] for x in data if x <= pos and x >= pos - width]) cdef inline int32_t right_sum ( data, int pos, int width ): """ """ return sum([data[x] for x in data if x >= pos and x <= pos + width]) cdef inline int32_t left_forward ( data, int pos, int window_size ): return data.get(pos,0) - data.get(pos-window_size, 0) cdef inline int32_t right_forward ( data, int pos, int window_size ): return data.get(pos + window_size, 0) - data.get(pos, 0) cdef wtd_find_summit(chrom, np.ndarray plus, np.ndarray minus, int32_t search_start, int32_t search_end, int32_t window_size, float cutoff): """internal function to be called by refine_peak_from_tags_distribution() """ cdef: int32_t i, j, watson_left, watson_right, crick_left, crick_right, wtd_max_pos float wtd_max_val np.ndarray wtd_list, wtd_other_max_pos, wtd_other_max_val watson, crick = (Counter(plus), Counter(minus)) watson_left = left_sum(watson, search_start, window_size) crick_left = left_sum(crick, search_start, window_size) watson_right = right_sum(watson, search_start, window_size) crick_right = right_sum(crick, search_start, window_size) wtd_list = np.zeros( search_end - search_start + 1, dtype="float32") i = 0 for j in range(search_start, search_end+1): wtd_list[i] = max((2 * (watson_left * crick_right)**0.5 - watson_right - crick_left),0) # minimum score is 0 watson_left += left_forward(watson, j, window_size) watson_right += right_forward(watson, j, window_size) crick_left += left_forward(crick, j, window_size) crick_right += right_forward(crick, j, window_size) i += 1 wtd_other_max_pos = maxima(wtd_list, window_size = window_size) wtd_other_max_pos = enforce_peakyness( wtd_list, wtd_other_max_pos ) wtd_other_max_val = wtd_list[wtd_other_max_pos] wtd_other_max_pos = wtd_other_max_pos + search_start return (wtd_other_max_pos, wtd_other_max_val > cutoff) cdef float median_from_value_length ( np.ndarray value, list length ): """ """ cdef: list tmp int32_t l_half, c, tmp_l float tmp_v tmp = sorted(zip( value, length )) l = sum( length )/2 for (tmp_v, tmp_l) in tmp: c += tmp_l if c > l: return tmp_v cdef float mean_from_value_length ( np.ndarray value, list length ): """ """ cdef: list tmp int32_t tmp_l float tmp_v, sum_v tmp = zip( value, length ) l = sum( length ) for (tmp_v, tmp_l) in tmp: sum_v += tmp_v * tmp_l return sum_v / l # ------------------------------------ # Classes # ------------------------------------ cdef class CallerFromAlignments: """A unit to calculate scores and call peaks from alignments -- FWTrackIII or PETrackI objects. It will compute for each chromosome separately in order to save memory usage. """ cdef: object treat # FWTrackIII or PETrackI object for ChIP object ctrl # FWTrackIII or PETrackI object for Control int d # extension size for ChIP list ctrl_d_s # extension sizes for Control. Can be multiple values float treat_scaling_factor # scaling factor for ChIP list ctrl_scaling_factor_s # scaling factor for Control, corresponding to each extension size. float lambda_bg # minimum local bias to fill missing values list chromosomes # name of common chromosomes in ChIP and Control data float pseudocount # the pseudocount used to calcuate logLR, FE or logFE str bedGraph_filename_prefix # prefix will be added to _pileup.bdg for treatment and _lambda.bdg for control #SHIFTCONTROL is obsolete int end_shift # shift of cutting ends before extension bool trackline # whether trackline should be saved in bedGraph bool save_bedGraph # whether to save pileup and local bias in bedGraph files bool save_SPMR # whether to save pileup normalized by sequencing depth in million reads bool no_lambda_flag # whether ignore local bias, and to use global bias instead bool PE_mode # whether it's in PE mode, will be detected during initiation # temporary data buffer str chrom # name of current chromosome list chr_pos_treat_ctrl # temporary [position, treat_pileup, ctrl_pileup] for a given chromosome str bedGraph_treat_filename str bedGraph_control_filename object bedGraph_treat # file handler to write ChIP pileup object bedGraph_ctrl # file handler to write Control pileup # data needed to be pre-computed before peak calling object pqtable # remember pvalue->qvalue convertion bool pvalue_all_done # whether the pvalue of whole genome is all calculated. If yes, it's OK to calculate q-value. double test_time def __init__ (self, treat, ctrl, int d = 200, list ctrl_d_s = [200, 1000, 10000], float treat_scaling_factor = 1.0, list ctrl_scaling_factor_s = [1.0, 0.2, 0.02], bool stderr_on = False, float pseudocount = 1.0, int end_shift = 0, float lambda_bg = 0, bool save_bedGraph = False, str bedGraph_filename_prefix = "", str bedGraph_treat_filename = "", str bedGraph_control_filename = "", bool save_SPMR = False): """Initialize. A calculator is unique to each comparison of treat and control. Treat_depth and ctrl_depth should not be changed during calculation. treat and ctrl are two FWTrackIII or PETrackI objects. treat_depth and ctrl_depth are effective depth in million: sequencing depth in million after duplicates being filtered. If treatment is scaled down to control sample size, then this should be control sample size in million. And vice versa. d, sregion, lregion: d is the fragment size, sregion is the small region size, lregion is the large region size pseudocount: a pseudocount used to calculate logLR, FE or logFE. Please note this value will not be changed with normalization method. So if you really want to set pseudocount 1 per million reads, set it after you normalize treat and control by million reads by `change_normalizetion_method(ord('M'))`. """ cdef: set chr1, chr2 if isinstance(treat, FWTrackIII): self.PE_mode = False elif isinstance(treat, PETrackI): self.PE_mode = True else: raise Exception("Should be FWTrackIII or PETrackI object!") self.treat = treat if ctrl: self.ctrl = ctrl else: # while there is no control self.ctrl = treat self.trackline = False self.d = d # note, self.d doesn't make sense in PE mode self.ctrl_d_s = ctrl_d_s# note, self.d doesn't make sense in PE mode self.treat_scaling_factor = treat_scaling_factor self.ctrl_scaling_factor_s= ctrl_scaling_factor_s self.end_shift = end_shift self.lambda_bg = lambda_bg self.pqtable = None self.save_bedGraph = save_bedGraph self.save_SPMR = save_SPMR self.bedGraph_filename_prefix = bedGraph_filename_prefix self.bedGraph_treat_filename = bedGraph_treat_filename self.bedGraph_control_filename = bedGraph_control_filename if not self.ctrl_d_s or not self.ctrl_scaling_factor_s: self.no_lambda_flag = True else: self.no_lambda_flag = False self.pseudocount = pseudocount chr1 = set(self.treat.get_chr_names()) chr2 = set(self.ctrl.get_chr_names()) self.chromosomes = list(chr1.intersection(chr2)) self.test_time = 0 cpdef set_pseudocount( self, float pseudocount ): self.pseudocount = pseudocount cpdef enable_trackline( self ): """Turn on trackline with bedgraph output """ self.trackline = True cdef __pileup_treat_ctrl_a_chromosome ( self, str chrom ): """After this function is called, self.chr_pos_treat_ctrl will be reset and assigned to the pileup values of the given chromosome. """ cdef: list treat_pv, ctrl_pv long i double t assert chrom in self.chromosomes, "chromosome %s is not valid." % chrom # reset or clean existing self.chr_pos_treat_ctrl if self.chr_pos_treat_ctrl: # not a beautiful way to clean self.chr_pos_treat_ctrl[0].resize(10000,refcheck=False) self.chr_pos_treat_ctrl[1].resize(10000,refcheck=False) self.chr_pos_treat_ctrl[2].resize(10000,refcheck=False) self.chr_pos_treat_ctrl[0].resize(0,refcheck=False) self.chr_pos_treat_ctrl[1].resize(0,refcheck=False) self.chr_pos_treat_ctrl[2].resize(0,refcheck=False) #t = ttime() if self.PE_mode: treat_pv = self.treat.pileup_a_chromosome ( chrom, [self.treat_scaling_factor,], baseline_value = 0.0 ) else: treat_pv = self.treat.pileup_a_chromosome( chrom, [self.d,], [self.treat_scaling_factor,], baseline_value = 0.0, directional = True, end_shift = self.end_shift ) if not self.no_lambda_flag: if self.PE_mode: ctrl_pv = self.ctrl.pileup_a_chromosome_c( chrom, self.ctrl_d_s, self.ctrl_scaling_factor_s, baseline_value = self.lambda_bg ) #ctrl_pv = self.ctrl.pileup_a_chromosome( chrom, self.ctrl_scaling_factor_s, baseline_value = self.lambda_bg ) else: ctrl_pv = self.ctrl.pileup_a_chromosome( chrom, self.ctrl_d_s, self.ctrl_scaling_factor_s, baseline_value = self.lambda_bg, directional = False ) else: ctrl_pv = [treat_pv[0][-1:], pyarray(FBYTE4,[self.lambda_bg,])] # set a global lambda self.chr_pos_treat_ctrl = self.__chrom_pair_treat_ctrl( treat_pv, ctrl_pv) #self.test_time += ttime() - t # clean treat_pv and ctrl_pv treat_pv = [] ctrl_pv = [] return cdef list __chrom_pair_treat_ctrl ( self, treat_pv, ctrl_pv ): """*private* Pair treat and ctrl pileup for each region. return [p, t, c] list """ cdef: list ret long pre_p, p1, p2, index_ret float v1, v2 p1n = iter(treat_pv[0]).next # assign the next function to a viable to speed up v1n = iter(treat_pv[1]).next p2n = iter(ctrl_pv[0]).next # assign the next function to a viable to speed up v2n = iter(ctrl_pv[1]).next chrom_max_len = len(treat_pv[0])+len(ctrl_pv[0]) ret = [ np.zeros( chrom_max_len, dtype="int32" ), np.zeros( chrom_max_len, dtype="float32" ), np.zeros( chrom_max_len, dtype="float32" ) ] # p, t, c pre_p = 0 index_ret = 0 try: p1 = p1n() v1 = v1n() p2 = p2n() v2 = v2n() #if v2 == 0: # print p2, v2 while True: if p1 < p2: # clip a region from pre_p to p1, then set pre_p as p1. ret[0][index_ret] = p1 ret[1][index_ret] = v1 ret[2][index_ret] = v2 pre_p = p1 index_ret += 1 # call for the next p1 and v1 p1 = p1n() v1 = v1n() elif p2 < p1: # clip a region from pre_p to p2, then set pre_p as p2. ret[0][index_ret] = p2 ret[1][index_ret] = v1 ret[2][index_ret] = v2 pre_p = p2 index_ret += 1 # call for the next p2 and v2 p2 = p2n() v2 = v2n() elif p1 == p2: # from pre_p to p1 or p2, then set pre_p as p1 or p2. ret[0][index_ret] = p1 ret[1][index_ret] = v1 ret[2][index_ret] = v2 pre_p = p1 index_ret += 1 # call for the next p1, v1, p2, v2. p1 = p1n() v1 = v1n() p2 = p2n() v2 = v2n() except StopIteration: # meet the end of either bedGraphTrackI, simply exit pass ret[0].resize( index_ret, refcheck=False) ret[1].resize( index_ret, refcheck=False) ret[2].resize( index_ret, refcheck=False) return ret cdef np.ndarray __cal_score ( self, array1, array2, cal_func ): cdef: long i assert array1.shape[0] == array2.shape[0] s = np.zeros(array1.shape[0], dtype="float32") for i in range(array1.shape[0]): s[i] = cal_func( array1[i], array2[i] ) return s cdef object __cal_pvalue_qvalue_table ( self ): """After this function is called, self.pqtable is built. All chromosomes will be iterated. So it will take some time. """ cdef: str chrom np.ndarray pos_array, treat_array, ctrl_array dict pvalue_stat = {} long n, pre_p, this_p, length, j, pre_l, l, i double this_v, pre_v, v, q, pre_q, this_t, this_c long N, k, this_l double f long nhcal = 0 long npcal = 0 list unique_values double t0, t1, t logging.debug ( "Start to calculate pvalue stat..." ) t = 0 for chrom in self.chromosomes: pre_p = 0 self.__pileup_treat_ctrl_a_chromosome( chrom ) [pos_array, treat_array, ctrl_array] = self.chr_pos_treat_ctrl pn = iter(pos_array).next tn = iter(treat_array).next cn = iter(ctrl_array).next #t0 = ttime() for i in range(pos_array.shape[0]): this_p = pn() this_t = tn() this_c = cn() this_v = get_pscore( int(this_t), this_c ) this_l = this_p - pre_p if pvalue_stat.has_key( this_v ): pvalue_stat[ this_v ] += this_l else: pvalue_stat[ this_v ] = this_l pre_p = this_p #pos_array[ i ] #npcal += pos_array.shape[0] nhcal += pos_array.shape[0] #t1 = ttime() #t += t1 - t0 #t0 = t1 logging.debug ( "make pvalue_stat cost %.5f seconds" % t ) logging.debug ( "calculate pvalue/access hash for %d times" % nhcal ) #logging.debug ( "access hash for %d times" % nhcal ) nhval = 0 #for v in sorted(pvalue_stat.keys()): # if pvalue_stat[v]<=0: # print v, pvalue_stat[v] N = sum(pvalue_stat.values()) # total length k = 1 # rank f = -log10(N) pre_v = -2147483647 pre_l = 0 pre_q = 2147483647 # save the previous q-value #self.pqtable = {} self.pqtable = Float64HashTable() unique_values = sorted(pvalue_stat.keys(), reverse=True) #sorted(unique_values,reverse=True) for i in range(len(unique_values)): v = unique_values[i] l = pvalue_stat[v] #print "v",v,"l",l,"k",k,"f",f q = v + (log10(k) + f) q = max(0,min(pre_q,q)) # make q-score monotonic #self.pqtable.set_item(v, q) self.pqtable[ v ] = q pre_v = v pre_q = q k+=l nhcal += 1 logging.debug( "access pq hash for %d times" % nhcal ) return self.pqtable cpdef call_peaks ( self, list scoring_function_symbols, list score_cutoff_s, int min_length = 200, int max_gap = 50, bool call_summits = False ): """Call peaks for all chromosomes. Return a PeakIO object. scoring_function_s: symbols of functions to calculate score. 'p' for pscore, 'q' for qscore, 'f' for fold change, 's' for subtraction. for example: ['p', 'q'] score_cutoff_s : cutoff values corresponding to scoring functions min_length : minimum length of peak max_gap : maximum gap of 'insignificant' regions within a peak call_summits : boolean. Whether or not call sub-peaks. save_bedGraph : whether or not to save pileup and control into a bedGraph file """ cdef: str chrom str s peaks = PeakIO() # prepare p-q table if not self.pqtable: logging.info("#3 Pre-compute pvalue-qvalue table...") self.__cal_pvalue_qvalue_table() # prepare bedGraph file if self.save_bedGraph: self.bedGraph_treat = open( self.bedGraph_treat_filename, "w" ) self.bedGraph_ctrl = open( self.bedGraph_control_filename, "w" ) logging.info ("#3 In the peak calling step, the following will be performed simultaneously:") logging.info ("#3 Write bedGraph files for treatment pileup (after scaling if necessary)... %s" % self.bedGraph_filename_prefix + "_treat_pileup.bdg") logging.info ("#3 Write bedGraph files for control lambda (after scaling if necessary)... %s" % self.bedGraph_filename_prefix + "_control_lambda.bdg") if self.save_SPMR: logging.info ( "#3 --SPMR is requested, so pileup will be normalized by sequencing depth in million reads." ) elif self.treat_scaling_factor == 1: logging.info ( "#3 Pileup will be based on sequencing depth in treatment." ) else: logging.info ( "#3 Pileup will be based on sequencing depth in control." ) if self.trackline: # this line is REQUIRED by the wiggle format for UCSC browser self.bedGraph_treat.write( "track type=bedGraph name=\"treatment pileup\" description=\"treatment pileup after possible scaling for \'%s\'\"\n" % self.bedGraph_filename_prefix ) self.bedGraph_ctrl.write ( "track type=bedGraph name=\"control lambda\" description=\"control lambda after possible scaling for \'%s\'\"\n" % self.bedGraph_filename_prefix ) logging.info("#3 Call peaks for each chromosome...") for chrom in self.chromosomes: self.__chrom_call_peak_using_certain_criteria ( peaks, chrom, scoring_function_symbols, score_cutoff_s, min_length, max_gap, call_summits, self.save_bedGraph ) # close bedGraph file if self.save_bedGraph: self.bedGraph_treat.close() self.bedGraph_ctrl.close() self.save_bedGraph = False #print "test time: %f" % self.test_time return peaks cdef __chrom_call_peak_using_certain_criteria ( self, peaks, str chrom, list scoring_function_s, list score_cutoff_s, int min_length, int max_gap, bool call_summits, bool save_bedGraph ): """ Call peaks for a chromosome. Combination of criteria is allowed here. peaks: a PeakIO object scoring_function_s: symbols of functions to calculate score as score=f(x, y) where x is treatment pileup, and y is control pileup save_bedGraph : whether or not to save pileup and control into a bedGraph file """ cdef: int i str s np.ndarray above_cutoff, above_cutoff_endpos, above_cutoff_startpos np.ndarray pos_array, treat_array, ctrl_array np.ndarray above_cutoff_index_array list score_array_s # list to keep different types of scores list peak_content # to store information for a chunk in a peak region, it contains lists of: 1. left position; 2. right position; 3. treatment value; 4. control value; 5. list of scores at this chunk assert len(scoring_function_s) == len(score_cutoff_s), "number of functions and cutoffs should be the same!" peak_content = [] # to store points above cutoff # first, build pileup, self.chr_pos_treat_ctrl self.__pileup_treat_ctrl_a_chromosome( chrom ) [pos_array, treat_array, ctrl_array] = self.chr_pos_treat_ctrl # while save_bedGraph is true, invoke __write_bedGraph_for_a_chromosome if save_bedGraph: self.__write_bedGraph_for_a_chromosome ( chrom ) # keep all types of scores needed score_array_s = [] for i in range(len(scoring_function_s)): s = scoring_function_s[i] if s == 'p': score_array_s.append( self.__cal_pscore( treat_array, ctrl_array ) ) elif s == 'q': score_array_s.append( self.__cal_qscore( treat_array, ctrl_array ) ) elif s == 'f': score_array_s.append( self.__cal_FE( treat_array, ctrl_array ) ) elif s == 's': score_array_s.append( self.__cal_subtraction( treat_array, ctrl_array ) ) # get the regions with scores above cutoffs above_cutoff = np.nonzero( apply_multiple_cutoffs(score_array_s,score_cutoff_s) )[0] # this is not an optimized method. It would be better to store score array in a 2-D ndarray? above_cutoff_index_array = np.arange(pos_array.shape[0])[above_cutoff] # indices above_cutoff_endpos = pos_array[above_cutoff] # end positions of regions where score is above cutoff above_cutoff_startpos = pos_array[above_cutoff-1] # start positions of regions where score is above cutoff if above_cutoff.size == 0: # nothing above cutoff return peaks if above_cutoff[0] == 0: # first element > cutoff, fix the first point as 0. otherwise it would be the last item in data[chrom]['pos'] above_cutoff_startpos[0] = 0 # first bit of region above cutoff peak_content.append( (above_cutoff_startpos[0], above_cutoff_endpos[0], treat_array[above_cutoff_index_array[0]], ctrl_array[above_cutoff_index_array[0]], get_from_multiple_scores( score_array_s, above_cutoff_index_array[0])) ) for i in range( 1,above_cutoff_startpos.size ): if above_cutoff_startpos[i] - peak_content[-1][1] <= max_gap: # append peak_content.append( (above_cutoff_startpos[i], above_cutoff_endpos[i], treat_array[above_cutoff_index_array[i]], ctrl_array[above_cutoff_index_array[i]], get_from_multiple_scores( score_array_s, above_cutoff_index_array[i]) ) ) else: # close if call_summits: #for x in peak_content: # if x[2] >= 10: # print peak_content # break self.__close_peak_with_subpeaks (peak_content, peaks, min_length, chrom, smoothlen = min_length, score_cutoff_s = score_cutoff_s ) # smooth length is min_length, i.e. fragment size 'd' else: self.__close_peak_wo_subpeaks (peak_content, peaks, min_length, chrom, smoothlen = min_length, score_cutoff_s = score_cutoff_s ) # smooth length is min_length, i.e. fragment size 'd' peak_content = [ (above_cutoff_startpos[i], above_cutoff_endpos[i], treat_array[above_cutoff_index_array[i]], ctrl_array[above_cutoff_index_array[i]], get_from_multiple_scores( score_array_s, above_cutoff_index_array[i]) ), ] # save the last peak if not peak_content: return peaks else: if call_summits: self.__close_peak_with_subpeaks (peak_content, peaks, min_length, chrom, smoothlen = min_length, score_cutoff_s = score_cutoff_s ) # smooth length is min_length, i.e. fragment size 'd' else: self.__close_peak_wo_subpeaks (peak_content, peaks, min_length, chrom, smoothlen = min_length, score_cutoff_s = score_cutoff_s ) # smooth length is min_length, i.e. fragment size 'd' return peaks cdef bool __close_peak_wo_subpeaks (self, list peak_content, peaks, int min_length, str chrom, int smoothlen=0, list score_cutoff_s=[]): """Close the peak region, output peak boundaries, peak summit and scores, then add the peak to peakIO object. peak_content contains [start, end, treat_p, ctrl_p, list_scores] peaks: a PeakIO object """ cdef: int summit_pos, tstart, tend, tmpindex, summit_index, i, midindex double treat_v, ctrl_v, tsummitvalue, ttreat_p, tctrl_p, tscore, summit_treat, summit_ctrl, summit_p_score, summit_q_score list tlist_scores_p peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ] if peak_length >= min_length: # if the peak is too small, reject it tsummit = [] summit_pos = 0 summit_value = 0 for i in range(len(peak_content)): (tstart, tend, ttreat_p, tctrl_p, tlist_scores_p) = peak_content[i] tscore = ttreat_p #self.pqtable[ get_pscore(int(ttreat_p), tctrl_p) ] # use qscore as general score to find summit if not summit_value or summit_value < tscore: tsummit = [(tend + tstart) / 2, ] tsummit_index = [ i, ] summit_value = tscore elif summit_value == tscore: # remember continuous summit values tsummit.append(int((tend + tstart) / 2)) tsummit_index.append( i ) # the middle of all highest points in peak region is defined as summit midindex = int((len(tsummit) + 1) / 2) - 1 summit_pos = tsummit[ midindex ] summit_index = tsummit_index[ midindex ] summit_treat = peak_content[ summit_index ][ 2 ] summit_ctrl = peak_content[ summit_index ][ 3 ] # this is a double-check to see if the summit can pass cutoff values. for i in range(len(score_cutoff_s)): if score_cutoff_s[i] > peak_content[ summit_index ][ 4 ][i]: return False # not passed, then disgard this peak. summit_p_score = get_pscore( int(summit_treat), summit_ctrl ) summit_q_score = self.pqtable[ summit_p_score ] peaks.add( chrom, # chromosome peak_content[0][0], # start peak_content[-1][1], # end summit = summit_pos, # summit position peak_score = summit_q_score, # score at summit pileup = summit_treat, # pileup pscore = summit_p_score, # pvalue fold_change = float ( summit_treat + self.pseudocount ) / ( summit_ctrl + self.pseudocount ), # fold change qscore = summit_q_score # qvalue ) # start a new peak return True cdef bool __close_peak_with_subpeaks (self, list peak_content, peaks, int min_length, str chrom, int smoothlen=51, list score_cutoff_s=[], float min_valley = 0.9 ): """Algorithm implemented by Ben, to profile the pileup signals within a peak region then find subpeak summits. This method is highly recommended for TFBS or DNAase I sites. """ cdef: int summit_pos, tstart, tend, tmpindex, summit_index, summit_offset int start, end, i, j, start_boundary, m, n, l double summit_value, tvalue, tsummitvalue np.ndarray[np.float32_t, ndim=1] peakdata np.ndarray[np.int32_t, ndim=1] peakindices, summit_offsets double ttreat_p, tctrl_p, tscore, summit_treat, summit_ctrl, summit_p_score, summit_q_score list tlist_scores_p peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ] if peak_length < min_length: return # if the region is too small, reject it # Add 10 bp padding to peak region so that we can get true minima end = peak_content[ -1 ][ 1 ] + 10 start = peak_content[ 0 ][ 0 ] - 10 if start < 0: start_boundary = 10 + start # this is the offset of original peak boundary in peakdata list. start = 0 else: start_boundary = 10 # this is the offset of original peak boundary in peakdata list. peakdata = np.zeros(end - start, dtype='float32') # save the scores (qscore) for each position in this region peakindices = np.zeros(end - start, dtype='int32') # save the indices for each position in this region for i in range(len(peak_content)): (tstart, tend, ttreat_p, tctrl_p, tlist_scores_p) = peak_content[i] #tscore = self.pqtable[ get_pscore(int(ttreat_p), tctrl_p) ] # use qscore as general score to find summit tscore = ttreat_p # use pileup as general score to find summit m = tstart - start + start_boundary n = tend - start + start_boundary peakdata[m:n] = tscore peakindices[m:n] = i summit_offsets = maxima(peakdata, smoothlen) # offsets are the indices for summits in peakdata/peakindices array. #print "maxima:",summit_offsets if summit_offsets.shape[0] == 0: # **failsafe** if no summits, fall back on old approach # return self.__close_peak_wo_subpeaks(peak_content, peaks, min_length, chrom) else: # remove maxima that occurred in padding m = np.searchsorted(summit_offsets, start_boundary) n = np.searchsorted(summit_offsets, peak_length + start_boundary, 'right') summit_offsets = summit_offsets[m:n] summit_offsets = enforce_peakyness(peakdata, summit_offsets) #print "enforced:",summit_offsets if summit_offsets.shape[0] == 0: # **failsafe** if no summits, fall back on old approach # return self.__close_peak_wo_subpeaks(peak_content, peaks, min_length, chrom) summit_indices = peakindices[summit_offsets] # indices are those point to peak_content summit_offsets -= start_boundary for summit_offset, summit_index in zip(summit_offsets, summit_indices): summit_treat = peak_content[ summit_index ][ 2 ] summit_ctrl = peak_content[ summit_index ][ 3 ] summit_p_score = get_pscore( int(summit_treat), summit_ctrl ) summit_q_score = self.pqtable[ summit_p_score ] for i in range(len(score_cutoff_s)): if score_cutoff_s[i] > peak_content[ summit_index ][ 4 ][i]: return False # not passed, then disgard this summit. peaks.add( chrom, peak_content[ 0 ][ 0 ], peak_content[ -1 ][ 1 ], summit = start + summit_offset, peak_score = summit_q_score, pileup = summit_treat, pscore = summit_p_score, fold_change = float ( summit_treat + self.pseudocount ) / ( summit_ctrl + self.pseudocount ), # fold change qscore = summit_q_score ) # start a new peak return True cdef np.ndarray __cal_pscore ( self, array1, array2 ): cdef: long i assert array1.shape[0] == array2.shape[0] s = np.zeros(array1.shape[0], dtype="float32") for i in range(array1.shape[0]): s[i] = get_pscore( int(array1[i]), array2[i] ) return s cdef np.ndarray __cal_qscore ( self, array1, array2 ): cdef: long i assert array1.shape[0] == array2.shape[0] s = np.zeros(array1.shape[0], dtype="float32") for i in range(array1.shape[0]): s[i] = self.pqtable[ get_pscore( int(array1[i]), array2[i] ) ] return s cdef np.ndarray __cal_logLR ( self, array1, array2 ): cdef: long i assert array1.shape[0] == array2.shape[0] s = np.zeros(array1.shape[0], dtype="float32") for i in range(array1.shape[0]): s[i] = get_logLR_asym( array1[i] + self.pseudocount, array2[i] + self.pseudocount ) return s cdef np.ndarray __cal_logFE ( self, array1, array2 ): cdef: long i assert array1.shape[0] == array2.shape[0] s = np.zeros(array1.shape[0], dtype="float32") for i in range(array1.shape[0]): s[i] = get_logFE( array1[i] + self.pseudocount, array2[i] + self.pseudocount ) return s cdef np.ndarray __cal_FE ( self, array1, array2 ): cdef: long i assert array1.shape[0] == array2.shape[0] s = np.zeros(array1.shape[0], dtype="float32") for i in range(array1.shape[0]): s[i] = (array1[i] + self.pseudocount) / ( array2[i] + self.pseudocount ) return s cdef np.ndarray __cal_subtraction ( self, array1, array2 ): cdef: long i assert array1.shape[0] == array2.shape[0] s = np.zeros(array1.shape[0], dtype="float32") for i in range(array1.shape[0]): s[i] = array1[i] - array2[i] return s cdef bool __write_bedGraph_for_a_chromosome ( self, str chrom ): """Write treat/control values for a certain chromosome into a specified file handler. """ cdef: np.ndarray pos_array, treat_array, ctrl_array int l, i int p, pre_p_t, pre_p_c # current position, previous position for treat, previous position for control float pre_v_t, pre_v_c, v_t, v_c # previous value for treat, for control, current value for treat, for control float denominator # 1 if save_SPMR is false, or depth in million if save_SPMR is true. Note, while piling up and calling peaks, treatment and control have been scaled to the same depth, so we need to find what this 'depth' is. [pos_array, treat_array, ctrl_array] = self.chr_pos_treat_ctrl if self.save_SPMR: if self.treat_scaling_factor == 1: # in this case, control has been asked to be scaled to depth of treatment denominator = self.treat.total/1e6 else: # in this case, treatment has been asked to be scaled to depth of control denominator = self.ctrl.total/1e6 else: denominator = 1.0 l = pos_array.shape[ 0 ] if l == 0: # if there is no data, return return False # for treat t_write_func = self.bedGraph_treat.write c_write_func = self.bedGraph_ctrl.write pre_p_t = 0 pre_p_c = 0 pre_v_t = treat_array[ 0 ]/denominator pre_v_c = ctrl_array [ 0 ]/denominator for i in range( 1, l ): v_t = treat_array[ i ]/denominator v_c = ctrl_array [ i ]/denominator p = pos_array [ i-1 ] if abs(pre_v_t - v_t) > 1e-5: # precision is 5 digits t_write_func( "%s\t%d\t%d\t%.5f\n" % ( chrom, pre_p_t, p, pre_v_t ) ) pre_v_t = v_t pre_p_t = p if abs(pre_v_c - v_c) > 1e-5: # precision is 5 digits c_write_func( "%s\t%d\t%d\t%.5f\n" % ( chrom, pre_p_c, p, pre_v_c ) ) pre_v_c = v_c pre_p_c = p p = pos_array[ -1 ] # last one t_write_func( "%s\t%d\t%d\t%.5f\n" % ( chrom, pre_p_t, p, pre_v_t ) ) c_write_func( "%s\t%d\t%d\t%.5f\n" % ( chrom, pre_p_c, p, pre_v_c ) ) return True cpdef call_broadpeaks (self, list scoring_function_symbols, list lvl1_cutoff_s, list lvl2_cutoff_s, int min_length=200, int lvl1_max_gap=50, int lvl2_max_gap=400): """This function try to find enriched regions within which, scores are continuously higher than a given cutoff for level 1, and link them using the gap above level 2 cutoff with a maximum length of lvl2_max_gap. scoring_function_s: symbols of functions to calculate score. 'p' for pscore, 'q' for qscore, 'f' for fold change, 's' for subtraction. for example: ['p', 'q'] lvl1_cutoff_s: list of cutoffs at highly enriched regions, corresponding to scoring functions. lvl2_cutoff_s: list of cutoffs at less enriched regions, corresponding to scoring functions. min_length : minimum peak length, default 200. lvl1_max_gap : maximum gap to merge nearby enriched peaks, default 50. lvl2_max_gap : maximum length of linkage regions, default 400. Return both general PeakIO object for highly enriched regions and gapped broad regions in BroadPeakIO. """ cdef: int i, j str chrom object lvl1peaks, lvl1peakschrom, lvl1 object lvl2peaks, lvl2peakschrom, lvl2 object broadpeaks list chrs, tmppeakset #int tmp_n lvl1peaks = PeakIO() lvl2peaks = PeakIO() # prepare p-q table if not self.pqtable: logging.info("#3 Pre-compute pvalue-qvalue table...") self.__cal_pvalue_qvalue_table() # prepare bedGraph file if self.save_bedGraph: self.bedGraph_treat = open( self.bedGraph_treat_filename, "w" ) self.bedGraph_ctrl = open( self.bedGraph_control_filename, "w" ) logging.info ("#3 In the peak calling step, the following will be performed simultaneously:") logging.info ("#3 Write bedGraph files for treatment pileup (after scaling if necessary)... %s" % self.bedGraph_filename_prefix + "_treat_pileup.bdg") logging.info ("#3 Write bedGraph files for control lambda (after scaling if necessary)... %s" % self.bedGraph_filename_prefix + "_control_lambda.bdg") if self.trackline: # this line is REQUIRED by the wiggle format for UCSC browser self.bedGraph_treat.write( "track type=bedGraph name=\"treatment pileup\" description=\"treatment pileup after possible scaling for \'%s\'\"\n" % self.bedGraph_filename_prefix ) self.bedGraph_ctrl.write ( "track type=bedGraph name=\"control lambda\" description=\"control lambda after possible scaling for \'%s\'\"\n" % self.bedGraph_filename_prefix ) logging.info("#3 Call peaks for each chromosome...") for chrom in self.chromosomes: self.__chrom_call_broadpeak_using_certain_criteria ( lvl1peaks, lvl2peaks, chrom, scoring_function_symbols, lvl1_cutoff_s, lvl2_cutoff_s, min_length, lvl1_max_gap, lvl2_max_gap, self.save_bedGraph ) # close bedGraph file if self.save_bedGraph: self.bedGraph_treat.close() self.bedGraph_ctrl.close() self.save_bedGraph = False # now combine lvl1 and lvl2 peaks chrs = lvl1peaks.get_chr_names() broadpeaks = BroadPeakIO() # use lvl2_peaks as linking regions between lvl1_peaks for chrom in chrs: tmp_n = 0 lvl1peakschrom = lvl1peaks.get_data_from_chrom(chrom) lvl2peakschrom = lvl2peaks.get_data_from_chrom(chrom) lvl1peakschrom_next = iter(lvl1peakschrom).next tmppeakset = [] # to temporarily store lvl1 region inside a lvl2 region # our assumption is lvl1 regions should be included in lvl2 regions try: lvl1 = lvl1peakschrom_next() for i in range( len(lvl2peakschrom) ): # for each lvl2 peak, find all lvl1 peaks inside # I assume lvl1 peaks can be ALL covered by lvl2 peaks. lvl2 = lvl2peakschrom[i] while True: if lvl2["start"] <= lvl1["start"] and lvl1["end"] <= lvl2["end"]: tmppeakset.append(lvl1) lvl1 = lvl1peakschrom_next() else: # make a hierarchical broad peak #print lvl2["start"], lvl2["end"], lvl2["score"] self.__add_broadpeak ( broadpeaks, chrom, lvl2, tmppeakset) #tmp_n += 1 tmppeakset = [] break except StopIteration: # no more strong (aka lvl1) peaks left self.__add_broadpeak ( broadpeaks, chrom, lvl2, tmppeakset) #tmp_n += 1 tmppeakset = [] # add the rest lvl2 peaks for j in range( i+1, len(lvl2peakschrom) ): self.__add_broadpeak( broadpeaks, chrom, lvl2peakschrom[j], tmppeakset ) #tmp_n += 1 #print len(lvl1peakschrom), len(lvl2peakschrom), tmp_n return broadpeaks cdef __chrom_call_broadpeak_using_certain_criteria ( self, lvl1peaks, lvl2peaks, str chrom, list scoring_function_s, list lvl1_cutoff_s, list lvl2_cutoff_s, int min_length, int lvl1_max_gap, int lvl2_max_gap, bool save_bedGraph): """ Call peaks for a chromosome. Combination of criteria is allowed here. peaks: a PeakIO object scoring_function_s: symbols of functions to calculate score as score=f(x, y) where x is treatment pileup, and y is control pileup save_bedGraph : whether or not to save pileup and control into a bedGraph file """ cdef: int i str s np.ndarray above_cutoff, above_cutoff_endpos, above_cutoff_startpos np.ndarray pos_array, treat_array, ctrl_array np.ndarray above_cutoff_index_array list score_array_s # list to keep different types of scores list peak_content assert len(scoring_function_s) == len(lvl1_cutoff_s), "number of functions and cutoffs should be the same!" assert len(scoring_function_s) == len(lvl2_cutoff_s), "number of functions and cutoffs should be the same!" # first, build pileup, self.chr_pos_treat_ctrl self.__pileup_treat_ctrl_a_chromosome( chrom ) [pos_array, treat_array, ctrl_array] = self.chr_pos_treat_ctrl # while save_bedGraph is true, invoke __write_bedGraph_for_a_chromosome if save_bedGraph: self.__write_bedGraph_for_a_chromosome ( chrom ) # keep all types of scores needed score_array_s = [] for i in range(len(scoring_function_s)): s = scoring_function_s[i] if s == 'p': score_array_s.append( self.__cal_pscore( treat_array, ctrl_array ) ) elif s == 'q': score_array_s.append( self.__cal_qscore( treat_array, ctrl_array ) ) elif s == 'f': score_array_s.append( self.__cal_FE( treat_array, ctrl_array ) ) elif s == 's': score_array_s.append( self.__cal_subtraction( treat_array, ctrl_array ) ) # lvl1 : strong peaks peak_content = [] # to store points above cutoff # get the regions with scores above cutoffs above_cutoff = np.nonzero( apply_multiple_cutoffs(score_array_s,lvl1_cutoff_s) )[0] # this is not an optimized method. It would be better to store score array in a 2-D ndarray? above_cutoff_index_array = np.arange(pos_array.shape[0])[above_cutoff] # indices above_cutoff_endpos = pos_array[above_cutoff] # end positions of regions where score is above cutoff above_cutoff_startpos = pos_array[above_cutoff-1] # start positions of regions where score is above cutoff if above_cutoff.size == 0: # nothing above cutoff return if above_cutoff[0] == 0: # first element > cutoff, fix the first point as 0. otherwise it would be the last item in data[chrom]['pos'] above_cutoff_startpos[0] = 0 # first bit of region above cutoff peak_content.append( (above_cutoff_startpos[0], above_cutoff_endpos[0], treat_array[above_cutoff_index_array[0]], ctrl_array[above_cutoff_index_array[0]], get_from_multiple_scores( score_array_s, above_cutoff_index_array[0])) ) for i in range( 1,above_cutoff_startpos.size ): if above_cutoff_startpos[i] - peak_content[-1][1] <= lvl1_max_gap: # append peak_content.append( (above_cutoff_startpos[i], above_cutoff_endpos[i], treat_array[above_cutoff_index_array[i]], ctrl_array[above_cutoff_index_array[i]], get_from_multiple_scores( score_array_s, above_cutoff_index_array[i]) ) ) else: # close self.__close_peak_for_broad_region (peak_content, lvl1peaks, min_length, chrom, lvl1_max_gap/2 ) peak_content = [ (above_cutoff_startpos[i], above_cutoff_endpos[i], treat_array[above_cutoff_index_array[i]], ctrl_array[above_cutoff_index_array[i]], get_from_multiple_scores( score_array_s, above_cutoff_index_array[i]) ), ] # save the last peak if peak_content: self.__close_peak_for_broad_region (peak_content, lvl1peaks, min_length, chrom, lvl1_max_gap/2 ) # lvl2 : weak peaks peak_content = [] # to store points above cutoff # get the regions with scores above cutoffs above_cutoff = np.nonzero( apply_multiple_cutoffs(score_array_s,lvl2_cutoff_s) )[0] # this is not an optimized method. It would be better to store score array in a 2-D ndarray? above_cutoff_index_array = np.arange(pos_array.shape[0])[above_cutoff] # indices above_cutoff_endpos = pos_array[above_cutoff] # end positions of regions where score is above cutoff above_cutoff_startpos = pos_array[above_cutoff-1] # start positions of regions where score is above cutoff if above_cutoff.size == 0: # nothing above cutoff return if above_cutoff[0] == 0: # first element > cutoff, fix the first point as 0. otherwise it would be the last item in data[chrom]['pos'] above_cutoff_startpos[0] = 0 # first bit of region above cutoff peak_content.append( (above_cutoff_startpos[0], above_cutoff_endpos[0], treat_array[above_cutoff_index_array[0]], ctrl_array[above_cutoff_index_array[0]], get_from_multiple_scores( score_array_s, above_cutoff_index_array[0])) ) for i in range( 1,above_cutoff_startpos.size ): if above_cutoff_startpos[i] - peak_content[-1][1] <= lvl2_max_gap: # append peak_content.append( (above_cutoff_startpos[i], above_cutoff_endpos[i], treat_array[above_cutoff_index_array[i]], ctrl_array[above_cutoff_index_array[i]], get_from_multiple_scores( score_array_s, above_cutoff_index_array[i]) ) ) else: # close self.__close_peak_for_broad_region (peak_content, lvl2peaks, min_length, chrom, lvl2_max_gap/2 ) peak_content = [ (above_cutoff_startpos[i], above_cutoff_endpos[i], treat_array[above_cutoff_index_array[i]], ctrl_array[above_cutoff_index_array[i]], get_from_multiple_scores( score_array_s, above_cutoff_index_array[i]) ), ] # save the last peak if peak_content: self.__close_peak_for_broad_region (peak_content, lvl2peaks, min_length, chrom, lvl2_max_gap/2 ) return cdef bool __close_peak_for_broad_region (self, list peak_content, peaks, int min_length, str chrom, int smoothlen=0, list score_cutoff_s=[]): """Close the broad peak region, output peak boundaries, peak summit and scores, then add the peak to peakIO object. peak_content contains [start, end, treat_p, ctrl_p, list_scores] peaks: a BroadPeakIO object """ cdef: int summit_pos, tstart, tend, tmpindex, summit_index, i, midindex double treat_v, ctrl_v, tsummitvalue, ttreat_p, tctrl_p, tscore, summit_treat, summit_ctrl, summit_p_score, summit_q_score list tlist_scores_p, tlist_pileup, tlist_control, tlist_length np.ndarray tarray_pileup, tarray_control, tarray_pscore, tarray_qscore, tarray_fc peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ] if peak_length >= min_length: # if the peak is too small, reject it tlist_pileup = [] tlist_control= [] tlist_length = [] for i in range(len(peak_content)): # each position in broad peak (tstart, tend, ttreat_p, tctrl_p, tlist_scores_p) = peak_content[i] #tscore = ttreat_p #self.pqtable[ get_pscore(int(ttreat_p), tctrl_p) ] # use qscore as general score to find summit tlist_pileup.append( ttreat_p ) tlist_control.append( tctrl_p ) tlist_length.append( tend - tstart ) tarray_pileup = np.array( tlist_pileup, dtype="float32") tarray_control = np.array( tlist_control, dtype="float32") tarray_pscore = self.__cal_pscore( tarray_pileup, tarray_control ) tarray_qscore = self.__cal_qscore( tarray_pileup, tarray_control ) tarray_fc = self.__cal_FE ( tarray_pileup, tarray_control ) peaks.add( chrom, # chromosome peak_content[0][0], # start peak_content[-1][1], # end summit = 0, peak_score = mean_from_value_length( tarray_qscore, tlist_length ), pileup = mean_from_value_length( tarray_pileup, tlist_length ), pscore = mean_from_value_length( tarray_pscore, tlist_length ), fold_change = mean_from_value_length( tarray_fc, tlist_length ), qscore = mean_from_value_length( tarray_qscore, tlist_length ), ) # start a new peak return True cdef __add_broadpeak (self, bpeaks, str chrom, object lvl2peak, list lvl1peakset): """Internal function to create broad peak. *Note* lvl1peakset/strong_regions might be empty """ cdef: int blockNum, start, end str blockSizes, blockStarts, thickStart, thickEnd, #print lvl2peak["start"], lvl2peak["end"], lvl2peak["score"] start = lvl2peak["start"] end = lvl2peak["end"] if not lvl1peakset: #try: # will complement by adding 1bps start and end to this region # may change in the future if gappedPeak format was improved. bpeaks.add(chrom, start, end, score=lvl2peak["score"], thickStart=str(start), thickEnd=str(end), blockNum = 2, blockSizes = "1,1", blockStarts = "0,"+str(end-start-1), pileup = lvl2peak["pileup"], pscore = lvl2peak["pscore"], fold_change = lvl2peak["fc"], qscore = lvl2peak["qscore"] ) #except: # print [ chrom, start, end, lvl2peak["score"],".", ".", # 0, ".", ".", lvl2peak["pileup"], # lvl2peak["pscore"], lvl2peak["fc"], # lvl2peak["qscore"] ] # raise Exception("quit") return bpeaks thickStart = str(lvl1peakset[0]["start"]) thickEnd = str(lvl1peakset[-1]["end"]) blockNum = int(len(lvl1peakset)) blockSizes = ",".join(map(str,map(itemgetter("length"),lvl1peakset))) #join( map(lambda x:str(x["length"]),lvl1peakset) ) blockStarts = ",".join(getitem_then_subtract(lvl1peakset, start)) #join( map(lambda x:str(x["start"]-start),lvl1peakset) ) # add 1bp left and/or right block if necessary if int(thickStart) != start: # add 1bp left block thickStart = str(start) blockNum += 1 blockSizes = "1,"+blockSizes blockStarts = "0,"+blockStarts if int(thickEnd) != end: # add 1bp right block thickEnd = str(end) blockNum += 1 blockSizes = blockSizes+",1" blockStarts = blockStarts+","+str(end-start-1) bpeaks.add(chrom, start, end, score=lvl2peak["score"], thickStart=thickStart, thickEnd=thickEnd, blockNum = blockNum, blockSizes = blockSizes, blockStarts = blockStarts, pileup = lvl2peak["pileup"], pscore = lvl2peak["pscore"], fold_change = lvl2peak["fc"], qscore = lvl2peak["qscore"] ) return bpeaks cpdef refine_peak_from_tags_distribution ( self, peaks, int window_size = 100, float cutoff = 5 ): """Extract tags in peak, then apply func on extracted tags. peaks: redefined regions to extract raw tags in PeakIO type: check cPeakIO.pyx. window_size: this will be passed to func. cutoff: this will be passed to func. func needs the fixed number of parameters, so it's not flexible. Here is an example: wtd_find_summit(chrom, plus, minus, peak_start, peak_end, name , window_size, cutoff): """ cdef: int32_t c, m, i, j, pre_i, pre_j, pos, startpos, endpos np.ndarray plus, minus, rt_plus, rt_minus str chrom list temp, retval, pchrnames, cpeaks np.ndarray adjusted_summits, passflags if self.PE_mode: return None pchrnames = sorted(peaks.get_chr_names()) retval = [] # this object should be sorted if not self.treat.__sorted: self.treat.sort() # PeakIO object should be sorted peaks.sort() chrnames = self.treat.get_chr_names() ret_peaks = PeakIO() for c in range(len(pchrnames)): chrom = pchrnames[c] assert chrom in chrnames, "chromosome %s can't be found in the FWTrackIII object. %s" % (chrom, str(chrnames)) (plus, minus) = self.treat.__locations[chrom] cpeaks = peaks.get_data_from_chrom(chrom) prev_i = 0 prev_j = 0 for m in range(len(cpeaks)): thispeak = cpeaks[m] startpos = thispeak["start"] - window_size endpos = thispeak["end"] + window_size temp = [] for i in range(prev_i,plus.shape[0]): pos = plus[i] if pos < startpos: continue elif pos > endpos: prev_i = i break else: temp.append(pos) rt_plus = np.array(temp) temp = [] for j in range(prev_j,minus.shape[0]): pos = minus[j] if pos < startpos: continue elif pos > endpos: prev_j = j break else: temp.append(pos) rt_minus = np.array(temp) (adjusted_summits, passflags) = wtd_find_summit(chrom, rt_plus, rt_minus, startpos, endpos, window_size, cutoff) # those local maxima above cutoff will be defined as good summits for i in range(len(adjusted_summits)): adjusted_summit = adjusted_summits[i] passflag = passflags[i] if passflag: tmppeak = copy(thispeak) tmppeak["summit"] = adjusted_summit ret_peaks.add_PeakContent(chrom, tmppeak) # rewind window_size for i in range(prev_i, 0, -1): if plus[prev_i] - plus[i] >= window_size: break prev_i = i for j in range(prev_j, 0, -1): if minus[prev_j] - minus[j] >= window_size: break prev_j = j # end of a loop return ret_peaks ##### not fully implemented ##### # cdef class CallerFromTwoPileups: # """A unit to calculate scores and call peaks from two BedGraphTrackI object. # It will compute for each chromosome separately in order to save # memory usage. # """ # cdef: # object treat # BedGraphTrackI object for ChIP # object ctrl # BedGraphTrackI object for Control # list chromosomes # name of common chromosomes in ChIP and Control data # float pseudocount # the pseudocount used to calcuate logLR, FE or logFE # str bedGraph_filename_prefix # prefix will be added to _pileup.bdg for treatment and _lambda.bdg for control # bool trackline # whether trackline should be saved in bedGraph # bool save_bedGraph # whether to save scores in bedGraph files # # temporary data buffer # str chrom # name of current chromosome # list chr_pos_treat_ctrl # temporary [position, treat_pileup, ctrl_pileup] for a given chromosome # object bedGraph_score # file handler to write scores # # data needed to be pre-computed before peak calling # object pqtable # remember pvalue->qvalue convertion # bool pvalue_all_done # whether the pvalue of whole genome is all calculated. If yes, it's OK to calculate q-value. # def __init__ (self, treat, ctrl, # int d = 200, list ctrl_d_s = [200, 1000, 10000], # float treat_scaling_factor = 1.0, list ctrl_scaling_factor_s = [1.0, 0.2, 0.02], # bool stderr_on = False, # float pseudocount = 1.0, # bool halfextension = False, # float lambda_bg = 0, # bool shiftcontrol = False, # bool save_bedGraph = False, # str bedGraph_filename_prefix = ""): # """Initialize. # A calculator is unique to each comparison of treat and # control. Treat_depth and ctrl_depth should not be changed # during calculation. # treat and ctrl are two FWTrackIII or PETrackI objects. # treat_depth and ctrl_depth are effective depth in million: # sequencing depth in million after # duplicates being filtered. If # treatment is scaled down to # control sample size, then this # should be control sample size in # million. And vice versa. # d, sregion, lregion: d is the fragment size, sregion is the # small region size, lregion is the large # region size # pseudocount: a pseudocount used to calculate logLR, FE or # logFE. Please note this value will not be changed # with normalization method. So if you really want # to set pseudocount 1 per million reads, set it # after you normalize treat and control by million # reads by `change_normalizetion_method(ord('M'))`. # """ # cdef: # set chr1, chr2 # if isinstance(treat, FWTrackIII): # self.PE_mode = False # elif isinstance(ctrl, PETrackI): # self.PE_mode = True # else: # raise Exception("Should be FWTrackIII or PETrackI object!") # self.treat = treat # if ctrl: # self.ctrl = ctrl # else: # while there is no control # self.ctrl = treat # self.trackline = False # self.treat_edm = treat.total / 1000000.0 # self.ctrl_edm = ctrl.total / 10000000.0 # self.d = d # note, self.d doesn't make sense in PE mode # self.ctrl_d_s = ctrl_d_s# note, self.d doesn't make sense in PE mode # self.treat_scaling_factor = treat_scaling_factor # self.ctrl_scaling_factor_s= ctrl_scaling_factor_s # self.halfextension = halfextension # self.shiftcontrol = shiftcontrol # self.lambda_bg = lambda_bg # self.pqtable = None # self.save_bedGraph = save_bedGraph # self.bedGraph_filename_prefix = bedGraph_filename_prefix # if not self.ctrl_d_s or not self.ctrl_scaling_factor_s: # self.no_lambda_flag = True # else: # self.no_lambda_flag = False # self.pseudocount = pseudocount # chr1 = set(self.treat.get_chr_names()) # chr2 = set(self.ctrl.get_chr_names()) # self.chromosomes = list(chr1.intersection(chr2)) # cpdef set_pseudocount( self, float pseudocount ): # self.pseudocount = pseudocount # cpdef enable_trackline( self ): # """Turn on trackline with bedgraph output # """ # self.trackline = True # cdef __pileup_treat_ctrl_a_chromosome ( self, str chrom ): # """After this function is called, self.chr_pos_treat_ctrl will # be reset and assigned to the pileup values of the given # chromosome. # """ # cdef: # list treat_pv, ctrl_pv # long i # assert chrom in self.chromosomes, "chromosome %s is not valid." % chrom # # reset or clean existing self.chr_pos_treat_ctrl # if self.chr_pos_treat_ctrl: # not a beautiful way to clean # self.chr_pos_treat_ctrl[0].resize(10000,refcheck=False) # self.chr_pos_treat_ctrl[1].resize(10000,refcheck=False) # self.chr_pos_treat_ctrl[2].resize(10000,refcheck=False) # self.chr_pos_treat_ctrl[0].resize(0,refcheck=False) # self.chr_pos_treat_ctrl[1].resize(0,refcheck=False) # self.chr_pos_treat_ctrl[2].resize(0,refcheck=False) # if self.PE_mode: # treat_pv = self.treat.pileup_a_chromosome ( chrom, [self.treat_scaling_factor,], baseline_value = 0.0 ) # else: # treat_pv = self.treat.pileup_a_chromosome( chrom, [self.d,], [self.treat_scaling_factor,], baseline_value = 0.0, # directional = True, halfextension = self.halfextension ) # if not self.no_lambda_flag: # if self.PE_mode: # ctrl_pv = self.ctrl.pileup_a_chromosome( chrom, self.ctrl_scaling_factor_s, baseline_value = self.lambda_bg ) # else: # ctrl_pv = self.ctrl.pileup_a_chromosome( chrom, self.ctrl_d_s, self.ctrl_scaling_factor_s, # baseline_value = self.lambda_bg, # directional = self.shiftcontrol, # halfextension = self.halfextension ) # else: # ctrl_pv = [treat_pv[0][-1:], pyarray(FBYTE4,[self.lambda_bg,])] # set a global lambda # self.chr_pos_treat_ctrl = self.__chrom_pair_treat_ctrl( treat_pv, ctrl_pv) # # clean treat_pv and ctrl_pv # treat_pv = [] # ctrl_pv = [] # return # cdef list __chrom_pair_treat_ctrl ( self, treat_pv, ctrl_pv ): # """*private* Pair treat and ctrl pileup for each region. # return [p, t, c] list # """ # cdef: # list ret # long pre_p, p1, p2, index_ret # float v1, v2 # p1n = iter(treat_pv[0]).next # assign the next function to a viable to speed up # v1n = iter(treat_pv[1]).next # p2n = iter(ctrl_pv[0]).next # assign the next function to a viable to speed up # v2n = iter(ctrl_pv[1]).next # chrom_max_len = len(treat_pv[0])+len(ctrl_pv[0]) # ret = [ np.zeros( chrom_max_len, dtype="int32" ), # np.zeros( chrom_max_len, dtype="float32" ), # np.zeros( chrom_max_len, dtype="float32" ) ] # p, t, c # pre_p = 0 # index_ret = 0 # try: # p1 = p1n() # v1 = v1n() # p2 = p2n() # v2 = v2n() # #if v2 == 0: # # print p2, v2 # while True: # if p1 < p2: # # clip a region from pre_p to p1, then set pre_p as p1. # ret[0][index_ret] = p1 # ret[1][index_ret] = v1 # ret[2][index_ret] = v2 # pre_p = p1 # index_ret += 1 # # call for the next p1 and v1 # p1 = p1n() # v1 = v1n() # elif p2 < p1: # # clip a region from pre_p to p2, then set pre_p as p2. # ret[0][index_ret] = p2 # ret[1][index_ret] = v1 # ret[2][index_ret] = v2 # pre_p = p2 # index_ret += 1 # # call for the next p2 and v2 # p2 = p2n() # v2 = v2n() # elif p1 == p2: # # from pre_p to p1 or p2, then set pre_p as p1 or p2. # ret[0][index_ret] = p1 # ret[1][index_ret] = v1 # ret[2][index_ret] = v2 # pre_p = p1 # index_ret += 1 # # call for the next p1, v1, p2, v2. # p1 = p1n() # v1 = v1n() # p2 = p2n() # v2 = v2n() # except StopIteration: # # meet the end of either bedGraphTrackI, simply exit # pass # ret[0].resize( index_ret, refcheck=False) # ret[1].resize( index_ret, refcheck=False) # ret[2].resize( index_ret, refcheck=False) # return ret # cdef np.ndarray __cal_score ( self, array1, array2, cal_func ): # cdef: # long i # assert array1.shape[0] == array2.shape[0] # s = np.zeros(array1.shape[0], dtype="float32") # for i in range(array1.shape[0]): # s[i] = cal_func( array1[i], array2[i] ) # return s # cdef object __cal_pvalue_qvalue_table ( self ): # """After this function is called, self.pqtable is built. All # chromosomes will be iterated. So it will take some time. # """ # cdef: # str chrom # np.ndarray pos_array, treat_array, ctrl_array # dict pvalue_stat = {} # long n, pre_p, this_p, length, j, pre_l, l, i # double this_v, pre_v, v, q, pre_q # long N, k # double f # long nhcal = 0 # long npcal = 0 # list unique_values # double t0, t1, t # logging.debug ( "Start to calculate pvalue stat..." ) # t = 0 # for chrom in self.chromosomes: # prev_p = 0 # #t0 = ttime() # self.__pileup_treat_ctrl_a_chromosome( chrom ) # [pos_array, treat_array, ctrl_array] = self.chr_pos_treat_ctrl # #t1 = ttime() # #t += t1 - t0 # #t0 = t1 # for i in range(pos_array.shape[0]): # #if ctrl_array[i] == 0: # # print "cal c p:", chrom, i, pos_array[i], treat_array[i], ctrl_array[i] # this_v = get_pscore( int(treat_array[i]), ctrl_array[i] ) # npcal += 1 # if pvalue_stat.has_key(this_v): # pvalue_stat[ this_v ] += pos_array[ i ] - prev_p # nhcal += 1 # else: # pvalue_stat[ this_v ] = pos_array[ i ] - prev_p # nhcal += 1 # #print pos_array[ i ], prev_p, pos_array[ i ] - prev_p # prev_p = pos_array[ i ] # logging.debug ( "make pvalue_stat cost %.5f seconds" % t ) # logging.debug ( "calculate pvalue for %d times" % npcal ) # logging.debug ( "access hash for %d times" % nhcal ) # nhval = 0 # N = sum(pvalue_stat.values()) # total length # k = 1 # rank # f = -log10(N) # pre_v = -2147483647 # pre_l = 0 # pre_q = 2147483647 # save the previous q-value # #self.pqtable = {} # self.pqtable = Float64HashTable() # unique_values = sorted(pvalue_stat.keys(), reverse=True) #sorted(unique_values,reverse=True) # for i in range(len(unique_values)): # v = unique_values[i] # l = pvalue_stat[v] # q = v + (log10(k) + f) # q = max(0,min(pre_q,q)) # make q-score monotonic # #self.pqtable.set_item(v, q) # self.pqtable[ v ] = q # pre_v = v # pre_q = q # k+=l # nhcal += 1 # logging.debug( "access pq hash for %d times" % nhcal ) # return self.pqtable # cpdef call_peaks ( self, list scoring_function_symbols, list score_cutoff_s, int min_length = 200, # int max_gap = 50, bool call_summits = False ): # """Call peaks for all chromosomes. Return a PeakIO object. # scoring_function_s: symbols of functions to calculate score. 'p' for pscore, 'q' for qscore, 'f' for fold change, 's' for subtraction. for example: ['p', 'q'] # score_cutoff_s : cutoff values corresponding to scoring functions # min_length : minimum length of peak # max_gap : maximum gap of 'insignificant' regions within a peak # call_summits : boolean. Whether or not call sub-peaks. # save_bedGraph : whether or not to save pileup and control into a bedGraph file # """ # cdef: # str chrom # str s # peaks = PeakIO() # # prepare p-q table # if not self.pqtable: # logging.info("#3 Pre-compute pvalue-qvalue table...") # self.__cal_pvalue_qvalue_table() # # prepare bedGraph file # if self.save_bedGraph: # self.bedGraph_treat = open( self.bedGraph_filename_prefix + "_treat_pileup.bdg", "w" ) # self.bedGraph_ctrl = open( self.bedGraph_filename_prefix + "_control_lambda.bdg", "w" ) # logging.info ("#3 In the peak calling step, the following will be performed simultaneously:") # logging.info ("#3 Write bedGraph files for treatment pileup (after scaling if necessary)... %s" % self.bedGraph_filename_prefix + "_treat_pileup.bdg") # logging.info ("#3 Write bedGraph files for control lambda (after scaling if necessary)... %s" % self.bedGraph_filename_prefix + "_control_lambda.bdg") # if self.trackline: # # this line is REQUIRED by the wiggle format for UCSC browser # self.bedGraph_treat.write( "track type=bedGraph name=\"treatment pileup\" description=\"treatment pileup after possible scaling for \'%s\'\"\n" % self.bedGraph_filename_prefix ) # self.bedGraph_ctrl.write ( "track type=bedGraph name=\"control lambda\" description=\"control lambda after possible scaling for \'%s\'\"\n" % self.bedGraph_filename_prefix ) # logging.info("#3 Call peaks for each chromosome...") # for chrom in self.chromosomes: # self.__chrom_call_peak_using_certain_criteria ( peaks, chrom, scoring_function_symbols, score_cutoff_s, min_length, max_gap, call_summits, self.save_bedGraph ) # # close bedGraph file # if self.save_bedGraph: # self.bedGraph_treat.close() # self.bedGraph_ctrl.close() # self.save_bedGraph = False # return peaks # cdef __chrom_call_peak_using_certain_criteria ( self, peaks, str chrom, list scoring_function_s, list score_cutoff_s, int min_length, # int max_gap, bool call_summits, bool save_bedGraph): # """ Call peaks for a chromosome. # Combination of criteria is allowed here. # peaks: a PeakIO object # scoring_function_s: symbols of functions to calculate score as score=f(x, y) where x is treatment pileup, and y is control pileup # save_bedGraph : whether or not to save pileup and control into a bedGraph file # """ # cdef: # int i # str s # np.ndarray above_cutoff, above_cutoff_endpos, above_cutoff_startpos # np.ndarray pos_array, treat_array, ctrl_array # np.ndarray above_cutoff_index_array # list score_array_s # list to keep different types of scores # list peak_content # to store information for a chunk in a peak region, it contains lists of: 1. left position; 2. right position; 3. treatment value; 4. control value; 5. list of scores at this chunk # assert len(scoring_function_s) == len(score_cutoff_s), "number of functions and cutoffs should be the same!" # peak_content = [] # to store points above cutoff # # first, build pileup, self.chr_pos_treat_ctrl # self.__pileup_treat_ctrl_a_chromosome( chrom ) # [pos_array, treat_array, ctrl_array] = self.chr_pos_treat_ctrl # # while save_bedGraph is true, invoke __write_bedGraph_for_a_chromosome # if save_bedGraph: # self.__write_bedGraph_for_a_chromosome ( chrom ) # # keep all types of scores needed # score_array_s = [] # for i in range(len(scoring_function_s)): # s = scoring_function_s[i] # if s == 'p': # score_array_s.append( self.__cal_pscore( treat_array, ctrl_array ) ) # elif s == 'q': # score_array_s.append( self.__cal_qscore( treat_array, ctrl_array ) ) # elif s == 'f': # score_array_s.append( self.__cal_FE( treat_array, ctrl_array ) ) # elif s == 's': # score_array_s.append( self.__cal_subtraction( treat_array, ctrl_array ) ) # # get the regions with scores above cutoffs # above_cutoff = np.nonzero( apply_multiple_cutoffs(score_array_s,score_cutoff_s) )[0] # this is not an optimized method. It would be better to store score array in a 2-D ndarray? # above_cutoff_index_array = np.arange(pos_array.shape[0])[above_cutoff] # indices # above_cutoff_endpos = pos_array[above_cutoff] # end positions of regions where score is above cutoff # above_cutoff_startpos = pos_array[above_cutoff-1] # start positions of regions where score is above cutoff # if above_cutoff.size == 0: # # nothing above cutoff # return peaks # if above_cutoff[0] == 0: # # first element > cutoff, fix the first point as 0. otherwise it would be the last item in data[chrom]['pos'] # above_cutoff_startpos[0] = 0 # # first bit of region above cutoff # peak_content.append( (above_cutoff_startpos[0], above_cutoff_endpos[0], treat_array[above_cutoff_index_array[0]], ctrl_array[above_cutoff_index_array[0]], get_from_multiple_scores( score_array_s, above_cutoff_index_array[i])) ) # for i in range( 1,above_cutoff_startpos.size ): # if above_cutoff_startpos[i] - peak_content[-1][1] <= max_gap: # # append # peak_content.append( (above_cutoff_startpos[i], above_cutoff_endpos[i], treat_array[above_cutoff_index_array[i]], ctrl_array[above_cutoff_index_array[i]], get_from_multiple_scores( score_array_s, above_cutoff_index_array[i]) ) ) # else: # # close # if call_summits: # self.__close_peak_with_subpeaks (peak_content, peaks, min_length, chrom, max_gap/2 ) # else: # self.__close_peak_wo_subpeaks (peak_content, peaks, min_length, chrom, max_gap/2 ) # peak_content = [ (above_cutoff_startpos[i], above_cutoff_endpos[i], treat_array[above_cutoff_index_array[i]], ctrl_array[above_cutoff_index_array[i]], get_from_multiple_scores( score_array_s, above_cutoff_index_array[i]) ), ] # # save the last peak # if not peak_content: # return peaks # else: # if call_summits: # self.__close_peak_with_subpeaks (peak_content, peaks, min_length, chrom, max_gap/2, score_cutoff_s ) # smooth length is 1/2 max-gap # else: # self.__close_peak_wo_subpeaks (peak_content, peaks, min_length, chrom, max_gap/2, score_cutoff_s ) # smooth length is 1/2 max-gap # return peaks # cdef bool __close_peak_wo_subpeaks (self, list peak_content, peaks, int min_length, # str chrom, int smoothlen=0, list score_cutoff_s=[]): # """Close the peak region, output peak boundaries, peak summit # and scores, then add the peak to peakIO object. # peak_content contains [start, end, treat_p, ctrl_p, list_scores] # peaks: a PeakIO object # """ # cdef: # int summit_pos, tstart, tend, tmpindex, summit_index, i, midindex # double treat_v, ctrl_v, tsummitvalue, ttreat_p, tctrl_p, tscore, summit_treat, summit_ctrl, summit_p_score, summit_q_score # list tlist_scores_p # peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ] # if peak_length >= min_length: # if the peak is too small, reject it # tsummit = [] # summit_pos = 0 # summit_value = 0 # for i in range(len(peak_content)): # (tstart, tend, ttreat_p, tctrl_p, tlist_scores_p) = peak_content[i] # tscore = ttreat_p #self.pqtable[ get_pscore(int(ttreat_p), tctrl_p) ] # use qscore as general score to find summit # if not summit_value or summit_value < tscore: # tsummit = [(tend + tstart) / 2, ] # tsummit_index = [ i, ] # summit_value = tscore # elif summit_value == tscore: # # remember continuous summit values # tsummit.append(int((tend + tstart) / 2)) # tsummit_index.append( i ) # # the middle of all highest points in peak region is defined as summit # midindex = int((len(tsummit) + 1) / 2) - 1 # summit_pos = tsummit[ midindex ] # summit_index = tsummit_index[ midindex ] # summit_treat = peak_content[ summit_index ][ 2 ] # summit_ctrl = peak_content[ summit_index ][ 3 ] # # this is a double-check to see if the summit can pass cutoff values. # for i in range(len(score_cutoff_s)): # if score_cutoff_s[i] > peak_content[ summit_index ][ 4 ][i]: # return False # not passed, then disgard this peak. # summit_p_score = get_pscore( int(summit_treat), summit_ctrl ) # summit_q_score = self.pqtable[ summit_p_score ] # peaks.add( chrom, # chromosome # peak_content[0][0], # start # peak_content[-1][1], # end # summit = summit_pos, # summit position # peak_score = summit_q_score, # score at summit # pileup = summit_treat, # pileup # pscore = summit_p_score, # pvalue # fold_change = float ( summit_treat + self.pseudocount ) / ( summit_ctrl + self.pseudocount ), # fold change # qscore = summit_q_score # qvalue # ) # # start a new peak # return True # cdef bool __close_peak_with_subpeaks (self, list peak_content, peaks, int min_length, # str chrom, int smoothlen=51, list score_cutoff_s=[], # float min_valley = 0.9 ): # """Algorithm implemented by Ben, to profile the pileup signals # within a peak region then find subpeak summits. This method is # highly recommended for TFBS or DNAase I sites. # """ # cdef: # int summit_pos, tstart, tend, tmpindex, summit_index, summit_offset # int start, end, i, j, start_boundary, m, n # double summit_value, tvalue, tsummitvalue # np.ndarray[np.float32_t, ndim=1] peakdata # np.ndarray[np.int32_t, ndim=1] peakindices, summit_offsets # double ttreat_p, tctrl_p, tscore, summit_treat, summit_ctrl, summit_p_score, summit_q_score # list tlist_scores_p # # Add 10 bp padding to peak region so that we can get true minima # end = peak_content[ -1 ][ 1 ] + 10 # start = peak_content[ 0 ][ 0 ] - 10 # if start < 0: # start_boundary = 10 + start # start = 0 # else: # start_boundary = 10 # peak_length = end - start # if end - start < min_length: return # if the region is too small, reject it # peakdata = np.zeros(end - start, dtype='float32') # save the scores (qscore) for each position in this region # peakindices = np.zeros(end - start, dtype='int32') # save the indices for each position in this region # for i in range(len(peak_content)): # (tstart, tend, ttreat_p, tctrl_p, tlist_scores_p) = peak_content[i] # #tscore = self.pqtable[ get_pscore(int(ttreat_p), tctrl_p) ] # use qscore as general score to find summit # tscore = ttreat_p # use pileup as general score to find summit # m = tstart - start + start_boundary # n = tend - start + start_boundary # peakdata[m:n] = tscore # peakindices[m:n] = i # summit_offsets = maxima(peakdata, smoothlen) # offsets are the indices for summits in peakdata/peakindices array. # if summit_offsets.shape[0] == 0: # # **failsafe** if no summits, fall back on old approach # # return self.__close_peak_wo_subpeaks(peak_content, peaks, min_length, chrom) # else: # # remove maxima that occurred in padding # m = np.searchsorted(summit_offsets, start_boundary) # n = np.searchsorted(summit_offsets, peak_length + start_boundary, 'right') # summit_offsets = summit_offsets[m:n] # summit_offsets = enforce_peakyness(peakdata, summit_offsets) # if summit_offsets.shape[0] == 0: # # **failsafe** if no summits, fall back on old approach # # return self.__close_peak_wo_subpeaks(peak_content, peaks, min_length, chrom) # summit_indices = peakindices[summit_offsets] # indices are those point to peak_content # summit_offsets -= start_boundary # for summit_offset, summit_index in zip(summit_offsets, summit_indices): # summit_treat = peak_content[ summit_index ][ 2 ] # summit_ctrl = peak_content[ summit_index ][ 3 ] # summit_p_score = get_pscore( int(summit_treat), summit_ctrl ) # summit_q_score = self.pqtable[ summit_p_score ] # for i in range(len(score_cutoff_s)): # if score_cutoff_s[i] > peak_content[ summit_index ][ 4 ][i]: # return False # not passed, then disgard this summit. # peaks.add( chrom, # start, # end, # summit = start + summit_offset, # peak_score = summit_q_score, # pileup = summit_treat, # pscore = summit_p_score, # fold_change = float ( summit_treat + self.pseudocount ) / ( summit_ctrl + self.pseudocount ), # fold change # qscore = summit_q_score # ) # # start a new peak # return True # cdef np.ndarray __cal_pscore ( self, array1, array2 ): # cdef: # long i # assert array1.shape[0] == array2.shape[0] # s = np.zeros(array1.shape[0], dtype="float32") # for i in range(array1.shape[0]): # s[i] = get_pscore( int(array1[i]), array2[i] ) # return s # cdef np.ndarray __cal_qscore ( self, array1, array2 ): # cdef: # long i # assert array1.shape[0] == array2.shape[0] # s = np.zeros(array1.shape[0], dtype="float32") # for i in range(array1.shape[0]): # s[i] = self.pqtable[ get_pscore( int(array1[i]), array2[i] ) ] # return s # cdef np.ndarray __cal_logLR ( self, array1, array2 ): # cdef: # long i # assert array1.shape[0] == array2.shape[0] # s = np.zeros(array1.shape[0], dtype="float32") # for i in range(array1.shape[0]): # s[i] = logLR( array1[i] + self.pseudocount, array2[i] + self.pseudocount ) # return s # cdef np.ndarray __cal_logFE ( self, array1, array2 ): # cdef: # long i # assert array1.shape[0] == array2.shape[0] # s = np.zeros(array1.shape[0], dtype="float32") # for i in range(array1.shape[0]): # s[i] = get_logFE( array1[i] + self.pseudocount, array2[i] + self.pseudocount ) # return s # cdef np.ndarray __cal_FE ( self, array1, array2 ): # cdef: # long i # assert array1.shape[0] == array2.shape[0] # s = np.zeros(array1.shape[0], dtype="float32") # for i in range(array1.shape[0]): # s[i] = (array1[i] + self.pseudocount) / ( array2[i] + self.pseudocount ) # return s # cdef np.ndarray __cal_subtraction ( self, array1, array2 ): # cdef: # long i # assert array1.shape[0] == array2.shape[0] # s = np.zeros(array1.shape[0], dtype="float32") # for i in range(array1.shape[0]): # s[i] = array1[i] - array2[i] # return s # cdef bool __write_bedGraph_for_a_chromosome ( self, str chrom ): # """Write treat/control values for a certain chromosome into a # specified file handler. # """ # cdef: # np.ndarray pos_array, treat_array, ctrl_array # int l, i # int p, pre_p_t, pre_p_c # current position, previous position for treat, previous position for control # float pre_v_t, pre_v_c, v_t, v_c # previous value for treat, for control, current value for treat, for control # [pos_array, treat_array, ctrl_array] = self.chr_pos_treat_ctrl # l = pos_array.shape[ 0 ] # if l == 0: # if there is no data, return # return False # # for treat # t_write_func = self.bedGraph_treat.write # c_write_func = self.bedGraph_ctrl.write # pre_p_t = 0 # pre_p_c = 0 # pre_v_t = treat_array[ 0 ] # pre_v_c = ctrl_array [ 0 ] # for i in range( 1, l ): # v_t = treat_array[ i ] # v_c = ctrl_array [ i ] # p = pos_array [ i-1 ] # if abs(pre_v_t - v_t) > 1e-5: # precision is 5 digits # t_write_func( "%s\t%d\t%d\t%.5f\n" % ( chrom, pre_p_t, p, pre_v_t ) ) # pre_v_t = v_t # pre_p_t = p # if abs(pre_v_c - v_c) > 1e-5: # precision is 5 digits # c_write_func( "%s\t%d\t%d\t%.5f\n" % ( chrom, pre_p_c, p, pre_v_c ) ) # pre_v_c = v_c # pre_p_c = p # p = pos_array[ -1 ] # # last one # t_write_func( "%s\t%d\t%d\t%.5f\n" % ( chrom, pre_p_t, p, pre_v_t ) ) # c_write_func( "%s\t%d\t%d\t%.5f\n" % ( chrom, pre_p_c, p, pre_v_c ) ) # return True # cpdef tuple call_broadpeaks (self, list scoring_function_symbols, list lvl1_cutoff_s, list lvl2_cutoff_s, int min_length=200, int lvl1_max_gap=50, int lvl2_max_gap=400): # """This function try to find enriched regions within which, # scores are continuously higher than a given cutoff for level # 1, and link them using the gap above level 2 cutoff with a # maximum length of lvl2_max_gap. # scoring_function_s: symbols of functions to calculate score. 'p' for pscore, 'q' for qscore, 'f' for fold change, 's' for subtraction. for example: ['p', 'q'] # lvl1_cutoff_s: list of cutoffs at highly enriched regions, corresponding to scoring functions. # lvl2_cutoff_s: list of cutoffs at less enriched regions, corresponding to scoring functions. # min_length : minimum peak length, default 200. # lvl1_max_gap : maximum gap to merge nearby enriched peaks, default 50. # lvl2_max_gap : maximum length of linkage regions, default 400. # Return both general PeakIO object for highly enriched regions # and gapped broad regions in BroadPeakIO. # """ # cdef: # int i # str chrom # object lvl1peaks, lvl1peakschrom, lvl1 # object lvl2peaks, lvl2peakschrom, lvl2 # object broadpeaks # list chrs, tmppeakset # lvl1peaks = PeakIO() # lvl2peaks = PeakIO() # # prepare p-q table # if not self.pqtable: # logging.info("#3 Pre-compute pvalue-qvalue table...") # self.__cal_pvalue_qvalue_table() # # prepare bedGraph file # if self.save_bedGraph: # self.bedGraph_treat = open( self.bedGraph_filename_prefix + "_treat_pileup.bdg", "w" ) # self.bedGraph_ctrl = open( self.bedGraph_filename_prefix + "_control_lambda.bdg", "w" ) # logging.info ("#3 In the peak calling step, the following will be performed simultaneously:") # logging.info ("#3 Write bedGraph files for treatment pileup (after scaling if necessary)... %s" % self.bedGraph_filename_prefix + "_treat_pileup.bdg") # logging.info ("#3 Write bedGraph files for control lambda (after scaling if necessary)... %s" % self.bedGraph_filename_prefix + "_control_lambda.bdg") # if self.trackline: # # this line is REQUIRED by the wiggle format for UCSC browser # self.bedGraph_treat.write( "track type=bedGraph name=\"treatment pileup\" description=\"treatment pileup after possible scaling for \'%s\'\"\n" % self.bedGraph_filename_prefix ) # self.bedGraph_ctrl.write ( "track type=bedGraph name=\"control lambda\" description=\"control lambda after possible scaling for \'%s\'\"\n" % self.bedGraph_filename_prefix ) # logging.info("#3 Call peaks for each chromosome...") # for chrom in self.chromosomes: # self.__chrom_call_broadpeak_using_certain_criteria ( lvl1peaks, lvl2peaks, chrom, scoring_function_symbols, lvl1_cutoff_s, lvl2_cutoff_s, min_length, lvl1_max_gap, lvl2_max_gap, self.save_bedGraph ) # # close bedGraph file # if self.save_bedGraph: # self.bedGraph_treat.close() # self.bedGraph_ctrl.close() # self.save_bedGraph = False # # now combine lvl1 and lvl2 peaks # chrs = lvl1peaks.get_chr_names() # broadpeaks = BroadPeakIO() # # use lvl2_peaks as linking regions between lvl1_peaks # for chrom in chrs: # lvl1peakschrom = lvl1peaks.get_data_from_chrom(chrom) # lvl2peakschrom = lvl2peaks.get_data_from_chrom(chrom) # lvl1peakschrom_next = iter(lvl1peakschrom).next # tmppeakset = [] # to temporarily store lvl1 region inside a lvl2 region # # our assumption is lvl1 regions should be included in lvl2 regions # try: # lvl1 = lvl1peakschrom_next() # except StopIteration: # break # for i in range( len(lvl2peakschrom) ): # # for each lvl2 peak, find all lvl1 peaks inside # lvl2 = lvl2peakschrom[i] # try: # while True: # if lvl2["start"] <= lvl1["start"] and lvl1["end"] <= lvl2["end"]: # tmppeakset.append(lvl1) # else: # if tmppeakset: # self.__add_broadpeak ( broadpeaks, chrom, lvl2, tmppeakset) # tmppeakset = [] # break # lvl1 = lvl1peakschrom_next() # except StopIteration: # if tmppeakset: # self.__add_broadpeak ( broadpeaks, chrom, lvl2, tmppeakset) # break # return lvl1peaks, broadpeaks # cdef __chrom_call_broadpeak_using_certain_criteria ( self, lvl1peaks, lvl2peaks, str chrom, list scoring_function_s, list lvl1_cutoff_s, list lvl2_cutoff_s, # int min_length, int lvl1_max_gap, int lvl2_max_gap, bool save_bedGraph): # """ Call peaks for a chromosome. # Combination of criteria is allowed here. # peaks: a PeakIO object # scoring_function_s: symbols of functions to calculate score as score=f(x, y) where x is treatment pileup, and y is control pileup # save_bedGraph : whether or not to save pileup and control into a bedGraph file # """ # cdef: # int i # str s # np.ndarray above_cutoff, above_cutoff_endpos, above_cutoff_startpos # np.ndarray pos_array, treat_array, ctrl_array # np.ndarray above_cutoff_index_array # list score_array_s # list to keep different types of scores # list peak_content # assert len(scoring_function_s) == len(lvl1_cutoff_s), "number of functions and cutoffs should be the same!" # assert len(scoring_function_s) == len(lvl2_cutoff_s), "number of functions and cutoffs should be the same!" # peak_content = [] # to store points above cutoff # # first, build pileup, self.chr_pos_treat_ctrl # self.__pileup_treat_ctrl_a_chromosome( chrom ) # [pos_array, treat_array, ctrl_array] = self.chr_pos_treat_ctrl # # while save_bedGraph is true, invoke __write_bedGraph_for_a_chromosome # if save_bedGraph: # self.__write_bedGraph_for_a_chromosome ( chrom ) # # keep all types of scores needed # score_array_s = [] # for i in range(len(scoring_function_s)): # s = scoring_function_s[i] # if s == 'p': # score_array_s.append( self.__cal_pscore( treat_array, ctrl_array ) ) # elif s == 'q': # score_array_s.append( self.__cal_qscore( treat_array, ctrl_array ) ) # elif s == 'f': # score_array_s.append( self.__cal_FE( treat_array, ctrl_array ) ) # elif s == 's': # score_array_s.append( self.__cal_subtraction( treat_array, ctrl_array ) ) # # lvl1 : strong peaks # # get the regions with scores above cutoffs # above_cutoff = np.nonzero( apply_multiple_cutoffs(score_array_s,lvl1_cutoff_s) )[0] # this is not an optimized method. It would be better to store score array in a 2-D ndarray? # above_cutoff_index_array = np.arange(pos_array.shape[0])[above_cutoff] # indices # above_cutoff_endpos = pos_array[above_cutoff] # end positions of regions where score is above cutoff # above_cutoff_startpos = pos_array[above_cutoff-1] # start positions of regions where score is above cutoff # if above_cutoff.size == 0: # # nothing above cutoff # return # if above_cutoff[0] == 0: # # first element > cutoff, fix the first point as 0. otherwise it would be the last item in data[chrom]['pos'] # above_cutoff_startpos[0] = 0 # # first bit of region above cutoff # peak_content.append( (above_cutoff_startpos[0], above_cutoff_endpos[0], treat_array[above_cutoff_index_array[0]], ctrl_array[above_cutoff_index_array[0]]) ) # for i in range( 1,above_cutoff_startpos.size ): # if above_cutoff_startpos[i] - peak_content[-1][1] <= lvl1_max_gap: # # append # peak_content.append( (above_cutoff_startpos[i], above_cutoff_endpos[i], treat_array[above_cutoff_index_array[i]], ctrl_array[above_cutoff_index_array[i]] ) ) # else: # # close # self.__close_peak_wo_subpeaks (peak_content, lvl1peaks, min_length, chrom, lvl1_max_gap/2 ) # peak_content = [ (above_cutoff_startpos[i], above_cutoff_endpos[i], treat_array[above_cutoff_index_array[i]], ctrl_array[above_cutoff_index_array[i]] ), ] # # save the last peak # if peak_content: # self.__close_peak_wo_subpeaks (peak_content, lvl1peaks, min_length, chrom, lvl1_max_gap/2 ) # # lvl2 : weak peaks # # get the regions with scores above cutoffs # above_cutoff = np.nonzero( apply_multiple_cutoffs(score_array_s,lvl2_cutoff_s) )[0] # this is not an optimized method. It would be better to store score array in a 2-D ndarray? # above_cutoff_index_array = np.arange(pos_array.shape[0])[above_cutoff] # indices # above_cutoff_endpos = pos_array[above_cutoff] # end positions of regions where score is above cutoff # above_cutoff_startpos = pos_array[above_cutoff-1] # start positions of regions where score is above cutoff # if above_cutoff.size == 0: # # nothing above cutoff # return # if above_cutoff[0] == 0: # # first element > cutoff, fix the first point as 0. otherwise it would be the last item in data[chrom]['pos'] # above_cutoff_startpos[0] = 0 # # first bit of region above cutoff # peak_content.append( (above_cutoff_startpos[0], above_cutoff_endpos[0], treat_array[above_cutoff_index_array[0]], ctrl_array[above_cutoff_index_array[0]]) ) # for i in range( 1,above_cutoff_startpos.size ): # if above_cutoff_startpos[i] - peak_content[-1][1] <= lvl2_max_gap: # # append # peak_content.append( (above_cutoff_startpos[i], above_cutoff_endpos[i], treat_array[above_cutoff_index_array[i]], ctrl_array[above_cutoff_index_array[i]] ) ) # else: # # close # self.__close_peak_wo_subpeaks (peak_content, lvl2peaks, min_length, chrom, lvl2_max_gap/2 ) # peak_content = [ (above_cutoff_startpos[i], above_cutoff_endpos[i], treat_array[above_cutoff_index_array[i]], ctrl_array[above_cutoff_index_array[i]] ), ] # # save the last peak # if peak_content: # self.__close_peak_wo_subpeaks (peak_content, lvl2peaks, min_length, chrom, lvl2_max_gap/2 ) # return # cdef __add_broadpeak (self, bpeaks, str chrom, object lvl2peak, list lvl1peakset): # """Internal function to create broad peak. # """ # cdef: # int blockNum, thickStart, thickEnd, start, end # str blockSizes, blockStarts # start = lvl2peak["start"] # end = lvl2peak["end"] # thickStart = lvl1peakset[0]["start"] # thickEnd = lvl1peakset[-1]["end"] # blockNum = int(len(lvl1peakset)) # blockSizes = ",".join(map(str,map(itemgetter("length"),lvl1peakset))) #join( map(lambda x:str(x["length"]),lvl1peakset) ) # blockStarts = ",".join(getitem_then_subtract(lvl1peakset, start)) #join( map(lambda x:str(x["start"]-start),lvl1peakset) ) # #print blockSizes, blockStarts # if lvl2peak["start"] != thickStart: # # add 1bp mark for the start of lvl2 peak # blockNum += 1 # blockSizes = "1,"+blockSizes # blockStarts = "0,"+blockStarts # if lvl2peak["end"] != thickEnd: # # add 1bp mark for the end of lvl2 peak # blockNum += 1 # blockSizes = blockSizes+",1" # blockStarts = blockStarts+","+str(end-start-1) # bpeaks.add(chrom, start, end, score=lvl2peak["score"], thickStart=thickStart, thickEnd=thickEnd, # blockNum = blockNum, blockSizes = blockSizes, blockStarts = blockStarts) # return bpeaks # cpdef refine_peak_from_tags_distribution ( self, peaks, int window_size = 100, float cutoff = 5 ): # """Extract tags in peak, then apply func on extracted tags. # peaks: redefined regions to extract raw tags in PeakIO type: check cPeakIO.pyx. # window_size: this will be passed to func. # cutoff: this will be passed to func. # func needs the fixed number of parameters, so it's not flexible. Here is an example: # wtd_find_summit(chrom, plus, minus, peak_start, peak_end, name , window_size, cutoff): # """ # cdef: # int32_t c, m, i, j, pre_i, pre_j, pos, startpos, endpos # np.ndarray plus, minus, rt_plus, rt_minus # str chrom # list temp, retval, pchrnames, cpeaks # np.ndarray adjusted_summits, passflags # if self.PE_mode: # return None # pchrnames = sorted(peaks.get_chr_names()) # retval = [] # # this object should be sorted # if not self.treat.__sorted: self.treat.sort() # # PeakIO object should be sorted # peaks.sort() # chrnames = self.treat.get_chr_names() # ret_peaks = PeakIO() # for c in range(len(pchrnames)): # chrom = pchrnames[c] # assert chrom in chrnames, "chromosome %s can't be found in the FWTrackIII object. %s" % (chrom, str(chrnames)) # (plus, minus) = self.treat.__locations[chrom] # cpeaks = peaks.get_data_from_chrom(chrom) # prev_i = 0 # prev_j = 0 # for m in range(len(cpeaks)): # thispeak = cpeaks[m] # startpos = thispeak["start"] - window_size # endpos = thispeak["end"] + window_size # temp = [] # for i in range(prev_i,plus.shape[0]): # pos = plus[i] # if pos < startpos: # continue # elif pos > endpos: # prev_i = i # break # else: # temp.append(pos) # rt_plus = np.array(temp) # temp = [] # for j in range(prev_j,minus.shape[0]): # pos = minus[j] # if pos < startpos: # continue # elif pos > endpos: # prev_j = j # break # else: # temp.append(pos) # rt_minus = np.array(temp) # (adjusted_summits, passflags) = wtd_find_summit(chrom, rt_plus, rt_minus, startpos, endpos, window_size, cutoff) # # those local maxima above cutoff will be defined as good summits # for i in range(len(adjusted_summits)): # adjusted_summit = adjusted_summits[i] # passflag = passflags[i] # if passflag: # tmppeak = copy(thispeak) # tmppeak["summit"] = adjusted_summit # ret_peaks.add(chrom, tmppeak) # # rewind window_size # for i in range(prev_i, 0, -1): # if plus[prev_i] - plus[i] >= window_size: # break # prev_i = i # for j in range(prev_j, 0, -1): # if minus[prev_j] - minus[j] >= window_size: # break # prev_j = j # # end of a loop # return ret_peaks