changeset 22:63dace20719b draft

Uploaded
author alenail
date Wed, 13 Apr 2016 17:36:07 -0400
parents 248c4538e4ce
children dd504ac7bf6d
files chipsequtil/map_to_known_genes.py chipsequtil/map_to_known_genes.xml chipsequtil/map_to_known_genes/._map_to_known_genes.py chipsequtil/map_to_known_genes/._map_to_known_genes.xml chipsequtil/map_to_known_genes/map_to_known_genes.py chipsequtil/map_to_known_genes/map_to_known_genes.xml chipsequtil/map_to_known_genes/tool_dependencies.xml chipsequtil/pieplot_macs/pieplots_macs.py chipsequtil/pieplot_macs/pieplots_macs.xml chipsequtil/tool_dependencies.xml
diffstat 10 files changed, 440 insertions(+), 291 deletions(-) [+]
line wrap: on
line diff
--- a/chipsequtil/map_to_known_genes.py	Mon Apr 04 16:41:59 2016 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,230 +0,0 @@
-#!/usr/local/bin/python
-
-import sys, os
-from optparse import OptionParser
-from collections import defaultdict as dd
-from csv import DictReader, DictWriter
-
-from chipsequtil import MACSFile, BEDFile, KnownGeneFile, parse_number
-from chipsequtil.util import MultiLineHelpFormatter
-
-usage = '%prog [options] <knownGene file> <knownGene xRef file> <peaks file>'
-description = """
-Map the peaks in <peaks file> to genes in <knownGene file>.  <knownGene file> is\
-format is as specified in http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/knownGene.sql.\
-<peaks file> format is as produced by MACS.  If *auto* is chosen (default) file extension \
-is examined for *.xls* for default MACS format or *.bed* for BED format.  If the --detail \
-option is provided, the following extra fields are appended to each row:
-
-peak loc, dist from feature, map type, map subtype
-"""
-epilog = ''
-parser = OptionParser(usage=usage,description=description,epilog=epilog,formatter=MultiLineHelpFormatter())
-parser.add_option('--upstream-window',dest='upst_win',type='int',default=5500,help='window width in base pairs to consider promoter region [default: %default]')
-parser.add_option('--downstream-window',dest='dnst_win',type='int',default=2500,help='window width in base pairs to consider downstream region [default: %default]')
-parser.add_option('--tss',dest='tss',action='store_true',help='calculate downstream window from transcription start site instead of transcription end site')
-parser.add_option('--map-output',dest='peak_output',default=None,help='filename to output mapped peaks to [default: stdout]')
-parser.add_option('--stats-output',dest='stats_output',default=sys.stderr,help='filename to output summary stats in conversion [default: stderr]')
-parser.add_option('--peaks-format',dest='peaks_fmt',default='auto',type='choice',choices=['auto','MACS','BED'],help='format of peaks input file [default: %default]')
-parser.add_option('--detail',dest='detail',action='store_true',help='add extra fields to output, see description')
-parser.add_option('--intergenic',dest='intergenic',action='store_true',help='write intergenic peaks to the gene file as well with None as gene ID')
-#parser.add_option('--symbol-xref',dest='symbol_xref',default=None,help='use the kgXref table file supplied to find a gene symbol, output as second column')
-
-# TODO - options
-#parser.add_option('--use-cds',dest='use_cds',action='store_true',help='use cdsStart and cdsEnd fields instead of txStart and txEnd to do mapping')
-#parser.add_option('--capture-intergenic'...)
-#parser.add_option('--map-format',dest='peak_format',type='choice',choices=['default','BED'],help='format of peak output [default: %default]')
-#parser.add_option('--stats-format',dest='stats_format',type='choice',choices=['human','python'],help='format of summary stats output [default: %default]')
-
-def parse_gene_ref(ref_gene) :
-    reader = KnownGeneFile(ref_gene)
-    gene_ref = dd(list)
-    for ref_dict in reader :
-        gene_ref[ref_dict['chrom']].append(ref_dict)
-
-    return gene_ref
-
-
-def parse_gene_ref_line(l) :
-    l = map(parse_number, l) # coerce to numbers where possible
-    l[9] = map(parse_number, l[9].split(',')) # turn 'x,x,x,...' into list
-    l[10] = map(parse_number, l[10].split(','))
-    return l
-
-
-def main():
-    opts, args = parser.parse_args(sys.argv[1:])
-
-    if len(args) < 3 :
-        parser.error('Must provide three filename arguments')
-
-    gene_ref = parse_gene_ref(args[0])
-    xref_fn = args[1]
-    peaks_fn = args[2]
-
-    if opts.peaks_fmt == 'MACS' :
-        peaks_reader_cls = MACSFile
-        chr_field, start_field, end_field = 'chr', 'start', 'end'
-    elif opts.peaks_fmt == 'BED' :
-        peaks_reader_cls = BEDFile
-        chr_field, start_field, end_field = 'chrom', 'chromStart', 'chromEnd'
-    else :
-        # should never happen
-        fieldnames = []
-
-    #peaks_reader = DictReader(open(args[1]),fieldnames=fieldnames,delimiter='\t')
-    peaks_reader = peaks_reader_cls(peaks_fn)
-
-    # default output format:
-    if opts.peak_output :
-        peak_output = open(opts.peak_output,'w')
-    else :
-        peak_output = sys.stdout
-
-    fieldnames = peaks_reader.FIELD_NAMES
-    if opts.detail :
-        fieldnames += ["peak loc","dist from feature","map type","map subtype"]#"score"
-    output_fields = ['knownGeneID']+fieldnames
-
-    # see if the user wants gene symbols too
-    # TODO - actually make this an option, or make it required
-    opts.symbol_xref = xref_fn
-    if opts.symbol_xref :
-        kgXref_fieldnames = ['kgID','mRNA','spID','spDisplayID','geneSymbol','refseq','protAcc','description']
-        symbol_xref_reader = DictReader(open(opts.symbol_xref),fieldnames=kgXref_fieldnames,delimiter='\t')
-        symbol_xref_map = {}
-        for rec in symbol_xref_reader :
-            symbol_xref_map[rec['kgID']] = rec
-        output_fields = ['knownGeneID','geneSymbol']+fieldnames
-
-    peaks_writer = DictWriter(peak_output,output_fields,delimiter='\t',extrasaction='ignore',lineterminator='\n')
-    peaks_writer.writerow(dict([(k,k) for k in output_fields]))
-    unique_genes = set()
-    map_stats = dd(int)
-    for peak in peaks_reader :
-
-        # if this is a comment or header line get skip it
-        if peak[fieldnames[0]].startswith('#') or \
-           peak[fieldnames[0]] == fieldnames[0] or \
-           peak[fieldnames[0]].startswith('track') : continue
-
-        # coerce values to numeric if possible
-        for k,v in peak.items() : peak[k] = parse_number(v)
-
-        # MACS output gives us summit
-        if opts.peaks_fmt == 'MACS' :
-            peak_loc = peak[start_field]+peak['summit']
-        else : # peak assumed to be in the middle of the reported peak range
-            peak_loc = (peak[start_field]+peak[end_field])/2
-
-        chrom_genes = gene_ref[peak[chr_field]]
-
-        if len(chrom_genes) == 0 :
-            sys.stdout.write('WARNING: peak chromosome %s not found in gene reference, skipping: %s\n'%(peak[chr_field],peak))
-            continue
-
-        mapped = False
-
-        # walk through the genes for this chromosome
-        for gene in chrom_genes :
-
-            # reusable dictionary for output
-            out_d = {}.fromkeys(output_fields,0)
-            out_d.update(peak)
-            out_d['map type'] = ''
-            out_d['chromo'] = peak[chr_field]
-            out_d['peak loc'] = peak_loc
-
-            # determine intervals for promoter, gene, and downstream
-            if gene['strand'] == '+' :
-                promoter_coords = max(gene['txStart']-1-opts.upst_win,0), gene['txStart']-1
-                if opts.tss :
-                    gene_coords = gene['txStart'], min(gene['txEnd'],gene['txStart']+opts.dnst_win)
-                    downstream_coords = gene['txEnd']+1,gene['txStart']+opts.dnst_win
-                else :
-                    gene_coords = gene['txStart'], gene['txEnd']
-                    downstream_coords = gene['txEnd']+1, gene['txEnd']+1+opts.dnst_win
-            else :
-                promoter_coords = gene['txEnd']+1, gene['txEnd']+1+opts.upst_win # +1 because we're using 1 based indexing
-                if opts.tss :
-                    gene_coords = max(gene['txStart'],gene['txEnd']-opts.upst_win), gene['txEnd']
-                    downstream_coords = gene['txEnd']-1-opts.dnst_win, gene['txStart']-1 # -1 because we're using 1 based indexing
-                else :
-                    gene_coords = gene['txStart'], gene['txEnd']
-                    downstream_coords = gene['txStart']-1-opts.dnst_win, gene['txStart']-1 # -1 because we're using 1 based indexing
-
-            # check for promoter
-            if peak_loc >= promoter_coords[0] and peak_loc <= promoter_coords[1] :
-                out_d['map type'] = 'promoter'
-                out_d['dist from feature'] = peak_loc - promoter_coords[1] if gene['strand'] == '+' else promoter_coords[0] - peak_loc
-
-            # check for gene
-            elif peak_loc >= gene_coords[0] and peak_loc <= gene_coords[1] :
-                # check for intron/exon
-                exon_coords = zip(gene['exonStarts'],gene['exonEnds'])
-                in_exon = False
-                for st,en in exon_coords :
-                    if peak_loc >= st and peak_loc <= en :
-                        in_exon = True
-                        break
-                out_d['map type'] = 'gene'
-                out_d['map subtype'] = 'exon' if in_exon else 'intron'
-
-                #Commented out to keep score reported in bed file - AJD 7/29/14
-                # score = (peak-TSS)/(TSE-TSS) - peak distance from TSS as fraction of length of gene
-                #gene_len = float(gene_coords[1]-gene_coords[0])
-                #out_d['score'] = (peak_loc-gene_coords[0])/gene_len if gene['strand'] == '+' else (gene_coords[1]-peak_loc)/gene_len
-
-                # distance calculated from start of gene
-                out_d['dist from feature'] = peak_loc - promoter_coords[1] if gene['strand'] == '+' else promoter_coords[0] - peak_loc
-
-                map_stats[out_d['map subtype']] += 1
-
-            # check for downstream
-            elif peak_loc >= downstream_coords[0] and peak_loc <= downstream_coords[1] :
-                out_d['map type'] = 'after'
-                if opts.tss :
-                    out_d['dist from feature'] = peak_loc - gene_coords[0] if gene['strand'] == '+' else gene_coords[1] - peak_loc
-                else :
-                    out_d['dist from feature'] = peak_loc - downstream_coords[0] if gene['strand'] == '+' else downstream_coords[1] - peak_loc
-
-            # does not map to this gene
-            else :
-                pass
-
-            # map type is not blank if we mapped to something
-            if out_d['map type'] != '' :
-
-                #out_d = {'knownGeneID':gene['name']}
-                out_d['knownGeneID'] = gene['name']
-                if opts.symbol_xref :
-                    out_d['geneSymbol'] = symbol_xref_map[gene['name']]['geneSymbol']
-                peaks_writer.writerow(out_d)
-                mapped = True
-
-                # reset map_type
-                out_d['map type'] = ''
-
-        if not mapped :
-            if opts.intergenic :
-                out_d['knownGeneID'] = 'None'
-                out_d['geneSymbol'] = 'None'
-                out_d['map type'] = 'intergenic'
-                peaks_writer.writerow(out_d)
-            map_stats['intergenic'] += 1
-
-    if peak_output != sys.stdout:
-        peak_output.close()
-
-    #if opts.stats_output != sys.stderr :
-    #    opts.stats_output = open(opts.stats_output,'w')
-
-    #for k,v in map_stats.items() :
-    #    opts.stats_output.write('%s: %s\n'%(k,v))
-
-    #if opts.stats_output != sys.stderr :
-    #    opts.stats_output.close()
-
-
-if __name__ == '__main__' :
-    main()
--- a/chipsequtil/map_to_known_genes.xml	Mon Apr 04 16:41:59 2016 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,44 +0,0 @@
-<tool id="chipsequtil_maptoknowngenes" name="Map Peaks to Known Genes" version="0.1">
-  <description>
-    Map the peaks in &lt;peaks file&gt; to genes in &lt;knownGene file&gt;.  &lt;knownGene file&gt; isformat is as specified in http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/knownGene.sql.&lt;peaks file&gt; format is as produced by MACS.  If *auto* is chosen (default) file extension is examined for *.xls* for default MACS format or *.bed* for BED format.  If the --detail option is provided, the following extra fields are appended to each row:
-    peak loc, dist from feature, map type, map subtype
-  </description>
-  <parallelism method="basic"></parallelism>
-  <requirements>
-    <requirement type="package" version="1.0">chipsequtil</requirement>
-  </requirements>
-  <command interpreter="python">
-    map_to_known_genes.py
-      $tss
-      --peaks-format=$peaks_fmt
-      --upstream-window=$upst_win
-      --downstream-window=$dnst_win
-      --map-output="$peaksOutput"
-      $detail
-      $intergenic
-      $knownGeneFile $knownGeneRef $macsPeaksFile
-
-  </command>
-  <inputs>
-    <param name="knownGeneFile" type="data" label="knownGene file" help="" optional="false" />
-    <param name="knownGeneRef" type="data" label="knownGene xRef file" help="" optional="false" />
-    <param name="macsPeaksFile" type="data" label="Peaks File" help="" optional="false" />
-
-    <param name="upst_win" type="integer" label="Upstream Window" help="Window width in base pairs to consider promoter region [default: %default]" optional="false" value="5500" />
-    <param name="dnst_win" type="integer" label="Downstream Window" help="Window width in base pairs to consider downstream region [default: %default]" optional="false" value="2500" />
-
-    <param name="tss" checked="true" label="Calculate downstream window from transcription start site instead of transcription end site" type="boolean" truevalue="--tss" falsevalue="" help="" />
-
-    <param name="peaks_fmt" type="select" label="Peaks Format" help="Format of peaks input file" optional="false">
-        <option value="MACS">MACS</option>
-        <option selected="true" value="BED">BED</option>
-    </param>
-
-    <param name="detail" checked="true" label="Append the following fields to each row: peak loc, dist from feature, map type, map subtype" type="boolean" truevalue="--detail" falsevalue="" help="" />
-    <param name="intergenic" checked="false" label="Write intergenic peaks to the gene file as well with None as gene ID" type="boolean" truevalue="--intergenic" falsevalue="" help="" />
-  </inputs>
-  <outputs>
-    <data name="peaksOutput" format="txt" hidden="false" />
-  </outputs>
-  <help></help>
-</tool>
Binary file chipsequtil/map_to_known_genes/._map_to_known_genes.py has changed
Binary file chipsequtil/map_to_known_genes/._map_to_known_genes.xml has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/chipsequtil/map_to_known_genes/map_to_known_genes.py	Wed Apr 13 17:36:07 2016 -0400
@@ -0,0 +1,230 @@
+#!/usr/local/bin/python
+
+import sys, os
+from optparse import OptionParser
+from collections import defaultdict as dd
+from csv import DictReader, DictWriter
+
+from chipsequtil import MACSFile, BEDFile, KnownGeneFile, parse_number
+from chipsequtil.util import MultiLineHelpFormatter
+
+usage = '%prog [options] <knownGene file> <knownGene xRef file> <peaks file>'
+description = """
+Map the peaks in <peaks file> to genes in <knownGene file>.  <knownGene file> is\
+format is as specified in http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/knownGene.sql.\
+<peaks file> format is as produced by MACS.  If *auto* is chosen (default) file extension \
+is examined for *.xls* for default MACS format or *.bed* for BED format.  If the --detail \
+option is provided, the following extra fields are appended to each row:
+
+peak loc, dist from feature, map type, map subtype
+"""
+epilog = ''
+parser = OptionParser(usage=usage,description=description,epilog=epilog,formatter=MultiLineHelpFormatter())
+parser.add_option('--upstream-window',dest='upst_win',type='int',default=5500,help='window width in base pairs to consider promoter region [default: %default]')
+parser.add_option('--downstream-window',dest='dnst_win',type='int',default=2500,help='window width in base pairs to consider downstream region [default: %default]')
+parser.add_option('--tss',dest='tss',action='store_true',help='calculate downstream window from transcription start site instead of transcription end site')
+parser.add_option('--map-output',dest='peak_output',default=None,help='filename to output mapped peaks to [default: stdout]')
+parser.add_option('--stats-output',dest='stats_output',default=sys.stderr,help='filename to output summary stats in conversion [default: stderr]')
+parser.add_option('--peaks-format',dest='peaks_fmt',default='auto',type='choice',choices=['auto','MACS','BED'],help='format of peaks input file [default: %default]')
+parser.add_option('--detail',dest='detail',action='store_true',help='add extra fields to output, see description')
+parser.add_option('--intergenic',dest='intergenic',action='store_true',help='write intergenic peaks to the gene file as well with None as gene ID')
+#parser.add_option('--symbol-xref',dest='symbol_xref',default=None,help='use the kgXref table file supplied to find a gene symbol, output as second column')
+
+# TODO - options
+#parser.add_option('--use-cds',dest='use_cds',action='store_true',help='use cdsStart and cdsEnd fields instead of txStart and txEnd to do mapping')
+#parser.add_option('--capture-intergenic'...)
+#parser.add_option('--map-format',dest='peak_format',type='choice',choices=['default','BED'],help='format of peak output [default: %default]')
+#parser.add_option('--stats-format',dest='stats_format',type='choice',choices=['human','python'],help='format of summary stats output [default: %default]')
+
+def parse_gene_ref(ref_gene) :
+    reader = KnownGeneFile(ref_gene)
+    gene_ref = dd(list)
+    for ref_dict in reader :
+        gene_ref[ref_dict['chrom']].append(ref_dict)
+
+    return gene_ref
+
+
+def parse_gene_ref_line(l) :
+    l = map(parse_number, l) # coerce to numbers where possible
+    l[9] = map(parse_number, l[9].split(',')) # turn 'x,x,x,...' into list
+    l[10] = map(parse_number, l[10].split(','))
+    return l
+
+
+def main():
+    opts, args = parser.parse_args(sys.argv[1:])
+
+    if len(args) < 3 :
+        parser.error('Must provide three filename arguments')
+
+    gene_ref = parse_gene_ref(args[0])
+    xref_fn = args[1]
+    peaks_fn = args[2]
+
+    if opts.peaks_fmt == 'MACS' :
+        peaks_reader_cls = MACSFile
+        chr_field, start_field, end_field = 'chr', 'start', 'end'
+    elif opts.peaks_fmt == 'BED' :
+        peaks_reader_cls = BEDFile
+        chr_field, start_field, end_field = 'chrom', 'chromStart', 'chromEnd'
+    else :
+        # should never happen
+        fieldnames = []
+
+    #peaks_reader = DictReader(open(args[1]),fieldnames=fieldnames,delimiter='\t')
+    peaks_reader = peaks_reader_cls(peaks_fn)
+
+    # default output format:
+    if opts.peak_output :
+        peak_output = open(opts.peak_output,'w')
+    else :
+        peak_output = sys.stdout
+
+    fieldnames = peaks_reader.FIELD_NAMES
+    if opts.detail :
+        fieldnames += ["peak loc","dist from feature","map type","map subtype"]#"score"
+    output_fields = ['knownGeneID']+fieldnames
+
+    # see if the user wants gene symbols too
+    # TODO - actually make this an option, or make it required
+    opts.symbol_xref = xref_fn
+    if opts.symbol_xref :
+        kgXref_fieldnames = ['kgID','mRNA','spID','spDisplayID','geneSymbol','refseq','protAcc','description']
+        symbol_xref_reader = DictReader(open(opts.symbol_xref),fieldnames=kgXref_fieldnames,delimiter='\t')
+        symbol_xref_map = {}
+        for rec in symbol_xref_reader :
+            symbol_xref_map[rec['kgID']] = rec
+        output_fields = ['knownGeneID','geneSymbol']+fieldnames
+
+    peaks_writer = DictWriter(peak_output,output_fields,delimiter='\t',extrasaction='ignore',lineterminator='\n')
+    peaks_writer.writerow(dict([(k,k) for k in output_fields]))
+    unique_genes = set()
+    map_stats = dd(int)
+    for peak in peaks_reader :
+
+        # if this is a comment or header line get skip it
+        if peak[fieldnames[0]].startswith('#') or \
+           peak[fieldnames[0]] == fieldnames[0] or \
+           peak[fieldnames[0]].startswith('track') : continue
+
+        # coerce values to numeric if possible
+        for k,v in peak.items() : peak[k] = parse_number(v)
+
+        # MACS output gives us summit
+        if opts.peaks_fmt == 'MACS' :
+            peak_loc = peak[start_field]+peak['summit']
+        else : # peak assumed to be in the middle of the reported peak range
+            peak_loc = (peak[start_field]+peak[end_field])/2
+
+        chrom_genes = gene_ref[peak[chr_field]]
+
+        if len(chrom_genes) == 0 :
+            sys.stdout.write('WARNING: peak chromosome %s not found in gene reference, skipping: %s\n'%(peak[chr_field],peak))
+            continue
+
+        mapped = False
+
+        # walk through the genes for this chromosome
+        for gene in chrom_genes :
+
+            # reusable dictionary for output
+            out_d = {}.fromkeys(output_fields,0)
+            out_d.update(peak)
+            out_d['map type'] = ''
+            out_d['chromo'] = peak[chr_field]
+            out_d['peak loc'] = peak_loc
+
+            # determine intervals for promoter, gene, and downstream
+            if gene['strand'] == '+' :
+                promoter_coords = max(gene['txStart']-1-opts.upst_win,0), gene['txStart']-1
+                if opts.tss :
+                    gene_coords = gene['txStart'], min(gene['txEnd'],gene['txStart']+opts.dnst_win)
+                    downstream_coords = gene['txEnd']+1,gene['txStart']+opts.dnst_win
+                else :
+                    gene_coords = gene['txStart'], gene['txEnd']
+                    downstream_coords = gene['txEnd']+1, gene['txEnd']+1+opts.dnst_win
+            else :
+                promoter_coords = gene['txEnd']+1, gene['txEnd']+1+opts.upst_win # +1 because we're using 1 based indexing
+                if opts.tss :
+                    gene_coords = max(gene['txStart'],gene['txEnd']-opts.upst_win), gene['txEnd']
+                    downstream_coords = gene['txEnd']-1-opts.dnst_win, gene['txStart']-1 # -1 because we're using 1 based indexing
+                else :
+                    gene_coords = gene['txStart'], gene['txEnd']
+                    downstream_coords = gene['txStart']-1-opts.dnst_win, gene['txStart']-1 # -1 because we're using 1 based indexing
+
+            # check for promoter
+            if peak_loc >= promoter_coords[0] and peak_loc <= promoter_coords[1] :
+                out_d['map type'] = 'promoter'
+                out_d['dist from feature'] = peak_loc - promoter_coords[1] if gene['strand'] == '+' else promoter_coords[0] - peak_loc
+
+            # check for gene
+            elif peak_loc >= gene_coords[0] and peak_loc <= gene_coords[1] :
+                # check for intron/exon
+                exon_coords = zip(gene['exonStarts'],gene['exonEnds'])
+                in_exon = False
+                for st,en in exon_coords :
+                    if peak_loc >= st and peak_loc <= en :
+                        in_exon = True
+                        break
+                out_d['map type'] = 'gene'
+                out_d['map subtype'] = 'exon' if in_exon else 'intron'
+
+                #Commented out to keep score reported in bed file - AJD 7/29/14
+                # score = (peak-TSS)/(TSE-TSS) - peak distance from TSS as fraction of length of gene
+                #gene_len = float(gene_coords[1]-gene_coords[0])
+                #out_d['score'] = (peak_loc-gene_coords[0])/gene_len if gene['strand'] == '+' else (gene_coords[1]-peak_loc)/gene_len
+
+                # distance calculated from start of gene
+                out_d['dist from feature'] = peak_loc - promoter_coords[1] if gene['strand'] == '+' else promoter_coords[0] - peak_loc
+
+                map_stats[out_d['map subtype']] += 1
+
+            # check for downstream
+            elif peak_loc >= downstream_coords[0] and peak_loc <= downstream_coords[1] :
+                out_d['map type'] = 'after'
+                if opts.tss :
+                    out_d['dist from feature'] = peak_loc - gene_coords[0] if gene['strand'] == '+' else gene_coords[1] - peak_loc
+                else :
+                    out_d['dist from feature'] = peak_loc - downstream_coords[0] if gene['strand'] == '+' else downstream_coords[1] - peak_loc
+
+            # does not map to this gene
+            else :
+                pass
+
+            # map type is not blank if we mapped to something
+            if out_d['map type'] != '' :
+
+                #out_d = {'knownGeneID':gene['name']}
+                out_d['knownGeneID'] = gene['name']
+                if opts.symbol_xref :
+                    out_d['geneSymbol'] = symbol_xref_map[gene['name']]['geneSymbol']
+                peaks_writer.writerow(out_d)
+                mapped = True
+
+                # reset map_type
+                out_d['map type'] = ''
+
+        if not mapped :
+            if opts.intergenic :
+                out_d['knownGeneID'] = 'None'
+                out_d['geneSymbol'] = 'None'
+                out_d['map type'] = 'intergenic'
+                peaks_writer.writerow(out_d)
+            map_stats['intergenic'] += 1
+
+    if peak_output != sys.stdout:
+        peak_output.close()
+
+    #if opts.stats_output != sys.stderr :
+    #    opts.stats_output = open(opts.stats_output,'w')
+
+    #for k,v in map_stats.items() :
+    #    opts.stats_output.write('%s: %s\n'%(k,v))
+
+    #if opts.stats_output != sys.stderr :
+    #    opts.stats_output.close()
+
+
+if __name__ == '__main__' :
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/chipsequtil/map_to_known_genes/map_to_known_genes.xml	Wed Apr 13 17:36:07 2016 -0400
@@ -0,0 +1,44 @@
+<tool id="chipsequtil_maptoknowngenes" name="Map Peaks to Known Genes" version="0.1">
+  <description>
+    Map the peaks in &lt;peaks file&gt; to genes in &lt;knownGene file&gt;.  &lt;knownGene file&gt; isformat is as specified in http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/knownGene.sql.&lt;peaks file&gt; format is as produced by MACS.  If *auto* is chosen (default) file extension is examined for *.xls* for default MACS format or *.bed* for BED format.  If the --detail option is provided, the following extra fields are appended to each row:
+    peak loc, dist from feature, map type, map subtype
+  </description>
+  <parallelism method="basic"></parallelism>
+  <requirements>
+    <requirement type="package" version="1.0">chipsequtil</requirement>
+  </requirements>
+  <command interpreter="python">
+    map_to_known_genes.py
+      $tss
+      --peaks-format=$peaks_fmt
+      --upstream-window=$upst_win
+      --downstream-window=$dnst_win
+      --map-output="$peaksOutput"
+      $detail
+      $intergenic
+      $knownGeneFile $knownGeneRef $macsPeaksFile
+
+  </command>
+  <inputs>
+    <param name="knownGeneFile" type="data" label="knownGene file" help="" optional="false" />
+    <param name="knownGeneRef" type="data" label="knownGene xRef file" help="" optional="false" />
+    <param name="macsPeaksFile" type="data" label="Peaks File" help="" optional="false" />
+
+    <param name="upst_win" type="integer" label="Upstream Window" help="Window width in base pairs to consider promoter region [default: %default]" optional="false" value="5500" />
+    <param name="dnst_win" type="integer" label="Downstream Window" help="Window width in base pairs to consider downstream region [default: %default]" optional="false" value="2500" />
+
+    <param name="tss" checked="true" label="Calculate downstream window from transcription start site instead of transcription end site" type="boolean" truevalue="--tss" falsevalue="" help="" />
+
+    <param name="peaks_fmt" type="select" label="Peaks Format" help="Format of peaks input file" optional="false">
+        <option value="MACS">MACS</option>
+        <option selected="true" value="BED">BED</option>
+    </param>
+
+    <param name="detail" checked="true" label="Append the following fields to each row: peak loc, dist from feature, map type, map subtype" type="boolean" truevalue="--detail" falsevalue="" help="" />
+    <param name="intergenic" checked="false" label="Write intergenic peaks to the gene file as well with None as gene ID" type="boolean" truevalue="--intergenic" falsevalue="" help="" />
+  </inputs>
+  <outputs>
+    <data name="peaksOutput" format="txt" hidden="false" />
+  </outputs>
+  <help></help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/chipsequtil/map_to_known_genes/tool_dependencies.xml	Wed Apr 13 17:36:07 2016 -0400
@@ -0,0 +1,17 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <package name="chipsequtil" version="1.0">
+        <install version="1.0">
+            <actions>
+                <action type="download_by_url">https://github.com/fraenkel-lab/chipsequtil/archive/master.zip</action>
+                <action type="shell_command">cp org_settings.cfg src/chipsequtil/</action>
+                <action type="shell_command">python setup.py install --install-lib $INSTALL_DIR/lib/python --install-scripts $INSTALL_DIR/bin</action>
+                <action type="set_environment">
+                    <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/bin</environment_variable>
+                    <environment_variable name="PYTHONPATH" action="prepend_to">$INSTALL_DIR/lib/python</environment_variable>
+                </action>
+            </actions>
+        </install>
+        <readme></readme>
+    </package>
+</tool_dependency>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/chipsequtil/pieplot_macs/pieplots_macs.py	Wed Apr 13 17:36:07 2016 -0400
@@ -0,0 +1,127 @@
+'''
+ NAME
+
+ pieplots_macs.py
+
+ SYNOPSIS
+
+python pieplots_macs.py --genefile MACSoutfile_genes.txt --peakfile MACSoutfile_peaks.bed --outfile MACSdirectory/piechart.pdf
+
+
+ DESCRIPTION
+
+Peaks are assigned to the closest gene and then categorized according to their location at different genomic regions (promoter, intron, exon, or after the gene). Sites >10kb away from any gene are considered intergenic. (from Pamela)
+
+'''
+
+__author__='Renan Escalante'
+__email__='renanec@mit.edu'
+
+import matplotlib
+matplotlib.use('pdf')
+from matplotlib import pyplot as plt
+matplotlib.rcParams['pdf.fonttype']=42
+matplotlib.rcParams['font.size']=14
+import sys
+from argparse import ArgumentParser
+import pandas as pd
+
+def map_peaks(gene,peak,outfile,macsFlag):
+    genefile = open(gene, 'r')
+    peakfile = open(peak, 'r')
+
+    types = {'promoter':0, 'after':0, 'intron':0, 'exon': 0}
+
+    #read mapped gene file, store closest map for each peak
+    peaks={} #{chrom:{peakStart:[dist, type]}}
+    for line in genefile:
+        words = line.strip().split('\t')
+        #ignore first line
+        if words[0] == 'knownGeneID': continue
+        chrom = words[2]
+
+
+        if not macsFlag:
+            try:
+                start = int(words[3])
+                dist = abs(int(words[15]))
+                maptype = words[16]
+                if maptype == 'gene':
+                    maptype = words[17]
+            except:
+                pass
+
+        else:
+            start = int(words[3])-1
+            dist = abs(int(words[12]))
+            maptype = words[14]
+            if maptype == 'gene':
+                maptype = words[15]
+
+
+        if chrom not in peaks:
+            #new chrom
+            peaks[chrom] = {start:[dist,maptype]}
+        else:
+            if start in peaks[chrom].keys():
+                #account for duplicate entries - choose closest gene and store type
+                if dist < peaks[chrom][start][0]:
+                    #closer gene
+                    peaks[chrom][start] = [dist, maptype]
+            else: peaks[chrom][start] = [dist, maptype]
+
+    #count types - 1 per peak in peak file
+    types = {'promoter':0, 'after':0, 'intron':0, 'exon': 0, 'inter': 0}
+    totalpks = 0
+    #Read peak file in bed format
+    for line in peakfile:
+        words = line.strip().split('\t')
+        chrom = words[0]
+        start = int(words[1])
+        if chrom in peaks:
+            if start in peaks[chrom]:
+                types[peaks[chrom][start][1]] += 1
+            else:
+                types['inter'] += 1
+        else:
+            types['inter'] += 1
+        totalpks += 1
+
+
+    #--------------------------------------------
+    #  make a square figure and axes
+    #--------------------------------------------
+
+    fig = plt.figure(figsize=(6,6))
+    pie_ax = fig.add_axes((0.3,0.2,0.4,0.4))
+
+    # The slices will be ordered and plotted counter-clockwise.
+    labels = ['exon: %i'%types['exon'],'intron: %i'%types['intron'],'promoter: %i'%types['promoter'],'intergenic: %i'%types['inter'], 'after: %i'%types['after']]
+    fracs = [types['exon'], types['intron'], types['promoter'], types['inter'], types['after']]
+
+    plt.pie(fracs, labels=labels) #, autopct='%1.1f%%')
+
+    #Export data frame with all the counts
+    indexDataFrame = ['exon','intron','promoter','intergenic','after']
+    df = pd.DataFrame(data=fracs, index=indexDataFrame)
+    dfFileName = outfile.replace("pdf","csv")
+    df.to_csv(dfFileName, sep=',')
+    #plt.title('MACS peaks in %s'%(name))
+    plt.figtext(.5, .1, 'Total: %i'%totalpks, ha='center')
+    fig.savefig(outfile)
+
+def main():
+    usage = "usage: %prog --genefile MACSoutfile_genes.txt --peakfile MACSoutfile_peaks.bed --outfile MACSdirectory/piechart.pdf"
+    parser = ArgumentParser(usage)
+    parser.add_argument("--genefile", dest="genefile", help="Path to file MACS_mfold10,30_pval1e-5_genes.txt")
+    parser.add_argument("--peakfile", dest="peakfile", help="Path to file MACS_mfold10,30_pval1e-5_peaks.bed")
+    parser.add_argument("--outfile", dest="outfile", default="MACS_piechart.pdf", help="Path to pdf file where you want to store the piechart")
+    parser.add_argument('--MACS',action='store_true',default=False,help='Set this value if you have MACS peaks')
+
+    args=parser.parse_args()
+
+    map_peaks(args.genefile, args.peakfile, args.outfile, args.MACS)
+
+
+if __name__=='__main__':
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/chipsequtil/pieplot_macs/pieplots_macs.xml	Wed Apr 13 17:36:07 2016 -0400
@@ -0,0 +1,22 @@
+<tool id="pieplots_macs" name="Pieplots MACS" version="0.1">
+  <description>
+    Peaks are assigned to the closest gene and then categorized according to their location at different genomic regions (promoter, intron, exon, or after the gene). Sites >10kb away from any gene are considered intergenic.
+  </description>
+  <parallelism method="basic"></parallelism>
+  <requirements>
+    <requirement type="package" version="1.2.1">matplotlib</requirement>
+    <requirement type="package">pandas</requirement>
+  </requirements>
+  <command interpreter="python">
+    pieplots_macs.py genefile $genefilepeakfile $peakfile $MACSoutfile $out
+  </command>
+  <inputs>
+    <param name="genefile" type="data" label="genefile file" help="" optional="false" />
+    <param name="peakfile" type="data" label="peakfile xRef file" help="" optional="false" />
+    <param name="MACS" checked="false" label="Set this value if you have MACS peaks" type="boolean" truevalue="--MACS" falsevalue="" help="" />
+  </inputs>
+  <outputs>
+    <data name="out" format="pdf" hidden="false" />
+  </outputs>
+  <help></help>
+</tool>
--- a/chipsequtil/tool_dependencies.xml	Mon Apr 04 16:41:59 2016 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,17 +0,0 @@
-<?xml version="1.0"?>
-<tool_dependency>
-    <package name="chipsequtil" version="1.0">
-        <install version="1.0">
-            <actions>
-                <action type="download_by_url">https://github.com/fraenkel-lab/chipsequtil/archive/master.zip</action>
-                <action type="shell_command">cp org_settings.cfg src/chipsequtil/</action>
-                <action type="shell_command">python setup.py install --install-lib $INSTALL_DIR/lib/python --install-scripts $INSTALL_DIR/bin</action>
-                <action type="set_environment">
-                    <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/bin</environment_variable>
-                    <environment_variable name="PYTHONPATH" action="prepend_to">$INSTALL_DIR/lib/python</environment_variable>
-                </action>
-            </actions>
-        </install>
-        <readme></readme>
-    </package>
-</tool_dependency>