# Time-stamp: <2014-06-17 00:42:04 Tao Liu> """Description: Filter duplicate reads depending on sequencing depth. Copyright (c) 2011 Tao Liu This code is free software; you can redistribute it and/or modify it under the terms of the BSD License (see the file COPYING included with the distribution). @status: release candidate @version: $Id$ @author: Yong Zhang, Tao Liu @contact: taoliu@jimmy.harvard.edu """ # ------------------------------------ # python modules # ------------------------------------ import os import sys import logging # ------------------------------------ # own python modules # ------------------------------------ from MACS2.OptValidator import opt_validate_filterdup as opt_validate from MACS2.cProb import binomial_cdf_inv from MACS2.Constants import * # ------------------------------------ # Main function # ------------------------------------ def run( o_options ): """The Main function/pipeline for duplication filter. """ # Parse options... options = opt_validate( o_options ) # end of parsing commandline options info = options.info warn = options.warn debug = options.debug error = options.error if options.outputfile != "stdout": outfhd = open( os.path.join( options.outdir, options.outputfile ) ,"w" ) else: outfhd = sys.stdout #1 Read tag files info("read tag files...") fwtrack = load_tag_files_options (options) info("tag size = %d" % options.tsize) fwtrack.fw = options.tsize t0 = fwtrack.total info(" total tags in alignment file: %d" % (t0)) if options.keepduplicates != "all": if options.keepduplicates == "auto": info("calculate max duplicate tags in single position based on binomal distribution...") max_dup_tags = cal_max_dup_tags(options.gsize,t0) info(" max_dup_tags based on binomal = %d" % (max_dup_tags)) info("filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)" % (max_dup_tags)) else: info("user defined the maximum tags...") max_dup_tags = int(options.keepduplicates) info("filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)" % (max_dup_tags)) if not options.dryrun: fwtrack = fwtrack.filter_dup(max_dup_tags) t1 = fwtrack.total else: t1 = fwtrack.filter_dup_dryrun( max_dup_tags ) info(" tags after filtering in alignment file: %d" % (t1)) info(" Redundant rate of alignment file: %.2f" % (float(t0-t1)/t0)) if not options.dryrun: info( "Write to BED file" ) fwtrack.print_to_bed( fhd=outfhd ) info( "finished! Check %s." % options.outputfile ) else: info( "Dry-run is finished!" ) def cal_max_dup_tags ( genome_size, tags_number, p=1e-5 ): """Calculate the maximum duplicated tag number based on genome size, total tag number and a p-value based on binomial distribution. Brute force algorithm to calculate reverse CDF no more than MAX_LAMBDA(100000). """ return binomial_cdf_inv(1-p,tags_number,1.0/genome_size) def load_tag_files_options ( options ): """From the options, load alignment tags. """ options.info("read alignment tags...") tp = options.parser(options.ifile) if not options.tsize: # override tsize if user specified --tsize ttsize = tp.tsize() options.tsize = ttsize treat = tp.build_fwtrack() treat.sort() options.info("tag size is determined as %d bps" % options.tsize) return treat