Previous changeset 21:248c4538e4ce (2016-04-04) Next changeset 23:dd504ac7bf6d (2016-04-13) |
Commit message:
Uploaded |
added:
chipsequtil/map_to_known_genes/._map_to_known_genes.py chipsequtil/map_to_known_genes/._map_to_known_genes.xml chipsequtil/map_to_known_genes/map_to_known_genes.py chipsequtil/map_to_known_genes/map_to_known_genes.xml chipsequtil/map_to_known_genes/tool_dependencies.xml chipsequtil/pieplot_macs/pieplots_macs.py chipsequtil/pieplot_macs/pieplots_macs.xml |
removed:
chipsequtil/map_to_known_genes.py chipsequtil/map_to_known_genes.xml chipsequtil/tool_dependencies.xml |
b |
diff -r 248c4538e4ce -r 63dace20719b chipsequtil/map_to_known_genes.py --- a/chipsequtil/map_to_known_genes.py Mon Apr 04 16:41:59 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
b'@@ -1,230 +0,0 @@\n-#!/usr/local/bin/python\n-\n-import sys, os\n-from optparse import OptionParser\n-from collections import defaultdict as dd\n-from csv import DictReader, DictWriter\n-\n-from chipsequtil import MACSFile, BEDFile, KnownGeneFile, parse_number\n-from chipsequtil.util import MultiLineHelpFormatter\n-\n-usage = \'%prog [options] <knownGene file> <knownGene xRef file> <peaks file>\'\n-description = """\n-Map the peaks in <peaks file> to genes in <knownGene file>. <knownGene file> is\\\n-format is as specified in http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/knownGene.sql.\\\n-<peaks file> format is as produced by MACS. If *auto* is chosen (default) file extension \\\n-is examined for *.xls* for default MACS format or *.bed* for BED format. If the --detail \\\n-option is provided, the following extra fields are appended to each row:\n-\n-peak loc, dist from feature, map type, map subtype\n-"""\n-epilog = \'\'\n-parser = OptionParser(usage=usage,description=description,epilog=epilog,formatter=MultiLineHelpFormatter())\n-parser.add_option(\'--upstream-window\',dest=\'upst_win\',type=\'int\',default=5500,help=\'window width in base pairs to consider promoter region [default: %default]\')\n-parser.add_option(\'--downstream-window\',dest=\'dnst_win\',type=\'int\',default=2500,help=\'window width in base pairs to consider downstream region [default: %default]\')\n-parser.add_option(\'--tss\',dest=\'tss\',action=\'store_true\',help=\'calculate downstream window from transcription start site instead of transcription end site\')\n-parser.add_option(\'--map-output\',dest=\'peak_output\',default=None,help=\'filename to output mapped peaks to [default: stdout]\')\n-parser.add_option(\'--stats-output\',dest=\'stats_output\',default=sys.stderr,help=\'filename to output summary stats in conversion [default: stderr]\')\n-parser.add_option(\'--peaks-format\',dest=\'peaks_fmt\',default=\'auto\',type=\'choice\',choices=[\'auto\',\'MACS\',\'BED\'],help=\'format of peaks input file [default: %default]\')\n-parser.add_option(\'--detail\',dest=\'detail\',action=\'store_true\',help=\'add extra fields to output, see description\')\n-parser.add_option(\'--intergenic\',dest=\'intergenic\',action=\'store_true\',help=\'write intergenic peaks to the gene file as well with None as gene ID\')\n-#parser.add_option(\'--symbol-xref\',dest=\'symbol_xref\',default=None,help=\'use the kgXref table file supplied to find a gene symbol, output as second column\')\n-\n-# TODO - options\n-#parser.add_option(\'--use-cds\',dest=\'use_cds\',action=\'store_true\',help=\'use cdsStart and cdsEnd fields instead of txStart and txEnd to do mapping\')\n-#parser.add_option(\'--capture-intergenic\'...)\n-#parser.add_option(\'--map-format\',dest=\'peak_format\',type=\'choice\',choices=[\'default\',\'BED\'],help=\'format of peak output [default: %default]\')\n-#parser.add_option(\'--stats-format\',dest=\'stats_format\',type=\'choice\',choices=[\'human\',\'python\'],help=\'format of summary stats output [default: %default]\')\n-\n-def parse_gene_ref(ref_gene) :\n- reader = KnownGeneFile(ref_gene)\n- gene_ref = dd(list)\n- for ref_dict in reader :\n- gene_ref[ref_dict[\'chrom\']].append(ref_dict)\n-\n- return gene_ref\n-\n-\n-def parse_gene_ref_line(l) :\n- l = map(parse_number, l) # coerce to numbers where possible\n- l[9] = map(parse_number, l[9].split(\',\')) # turn \'x,x,x,...\' into list\n- l[10] = map(parse_number, l[10].split(\',\'))\n- return l\n-\n-\n-def main():\n- opts, args = parser.parse_args(sys.argv[1:])\n-\n- if len(args) < 3 :\n- parser.error(\'Must provide three filename arguments\')\n-\n- gene_ref = parse_gene_ref(args[0])\n- xref_fn = args[1]\n- peaks_fn = args[2]\n-\n- if opts.peaks_fmt == \'MACS\' :\n- peaks_reader_cls = MACSFile\n- chr_field, start_field, end_field = \'chr\', \'start\', \'end\'\n- elif opts.peaks_fmt == \'BED\' :\n- peaks_reader_cls = BEDFile\n- chr_field, start_field, end_field = \'chrom\', \'chromStart\', \'chromEnd\'\n- else :\n- # should never happen\n- fieldnames = []\n-\n- #peaks_reader = DictReader(open(args[1]),fieldna'..b"e['txStart'], gene['txEnd']\n- downstream_coords = gene['txEnd']+1, gene['txEnd']+1+opts.dnst_win\n- else :\n- promoter_coords = gene['txEnd']+1, gene['txEnd']+1+opts.upst_win # +1 because we're using 1 based indexing\n- if opts.tss :\n- gene_coords = max(gene['txStart'],gene['txEnd']-opts.upst_win), gene['txEnd']\n- downstream_coords = gene['txEnd']-1-opts.dnst_win, gene['txStart']-1 # -1 because we're using 1 based indexing\n- else :\n- gene_coords = gene['txStart'], gene['txEnd']\n- downstream_coords = gene['txStart']-1-opts.dnst_win, gene['txStart']-1 # -1 because we're using 1 based indexing\n-\n- # check for promoter\n- if peak_loc >= promoter_coords[0] and peak_loc <= promoter_coords[1] :\n- out_d['map type'] = 'promoter'\n- out_d['dist from feature'] = peak_loc - promoter_coords[1] if gene['strand'] == '+' else promoter_coords[0] - peak_loc\n-\n- # check for gene\n- elif peak_loc >= gene_coords[0] and peak_loc <= gene_coords[1] :\n- # check for intron/exon\n- exon_coords = zip(gene['exonStarts'],gene['exonEnds'])\n- in_exon = False\n- for st,en in exon_coords :\n- if peak_loc >= st and peak_loc <= en :\n- in_exon = True\n- break\n- out_d['map type'] = 'gene'\n- out_d['map subtype'] = 'exon' if in_exon else 'intron'\n-\n- #Commented out to keep score reported in bed file - AJD 7/29/14\n- # score = (peak-TSS)/(TSE-TSS) - peak distance from TSS as fraction of length of gene\n- #gene_len = float(gene_coords[1]-gene_coords[0])\n- #out_d['score'] = (peak_loc-gene_coords[0])/gene_len if gene['strand'] == '+' else (gene_coords[1]-peak_loc)/gene_len\n-\n- # distance calculated from start of gene\n- out_d['dist from feature'] = peak_loc - promoter_coords[1] if gene['strand'] == '+' else promoter_coords[0] - peak_loc\n-\n- map_stats[out_d['map subtype']] += 1\n-\n- # check for downstream\n- elif peak_loc >= downstream_coords[0] and peak_loc <= downstream_coords[1] :\n- out_d['map type'] = 'after'\n- if opts.tss :\n- out_d['dist from feature'] = peak_loc - gene_coords[0] if gene['strand'] == '+' else gene_coords[1] - peak_loc\n- else :\n- out_d['dist from feature'] = peak_loc - downstream_coords[0] if gene['strand'] == '+' else downstream_coords[1] - peak_loc\n-\n- # does not map to this gene\n- else :\n- pass\n-\n- # map type is not blank if we mapped to something\n- if out_d['map type'] != '' :\n-\n- #out_d = {'knownGeneID':gene['name']}\n- out_d['knownGeneID'] = gene['name']\n- if opts.symbol_xref :\n- out_d['geneSymbol'] = symbol_xref_map[gene['name']]['geneSymbol']\n- peaks_writer.writerow(out_d)\n- mapped = True\n-\n- # reset map_type\n- out_d['map type'] = ''\n-\n- if not mapped :\n- if opts.intergenic :\n- out_d['knownGeneID'] = 'None'\n- out_d['geneSymbol'] = 'None'\n- out_d['map type'] = 'intergenic'\n- peaks_writer.writerow(out_d)\n- map_stats['intergenic'] += 1\n-\n- if peak_output != sys.stdout:\n- peak_output.close()\n-\n- #if opts.stats_output != sys.stderr :\n- # opts.stats_output = open(opts.stats_output,'w')\n-\n- #for k,v in map_stats.items() :\n- # opts.stats_output.write('%s: %s\\n'%(k,v))\n-\n- #if opts.stats_output != sys.stderr :\n- # opts.stats_output.close()\n-\n-\n-if __name__ == '__main__' :\n- main()\n" |
b |
diff -r 248c4538e4ce -r 63dace20719b chipsequtil/map_to_known_genes.xml --- a/chipsequtil/map_to_known_genes.xml Mon Apr 04 16:41:59 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,44 +0,0 @@ -<tool id="chipsequtil_maptoknowngenes" name="Map Peaks to Known Genes" version="0.1"> - <description> - Map the peaks in <peaks file> to genes in <knownGene file>. <knownGene file> isformat is as specified in http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/knownGene.sql.<peaks file> format is as produced by MACS. If *auto* is chosen (default) file extension is examined for *.xls* for default MACS format or *.bed* for BED format. If the --detail option is provided, the following extra fields are appended to each row: - peak loc, dist from feature, map type, map subtype - </description> - <parallelism method="basic"></parallelism> - <requirements> - <requirement type="package" version="1.0">chipsequtil</requirement> - </requirements> - <command interpreter="python"> - map_to_known_genes.py - $tss - --peaks-format=$peaks_fmt - --upstream-window=$upst_win - --downstream-window=$dnst_win - --map-output="$peaksOutput" - $detail - $intergenic - $knownGeneFile $knownGeneRef $macsPeaksFile - - </command> - <inputs> - <param name="knownGeneFile" type="data" label="knownGene file" help="" optional="false" /> - <param name="knownGeneRef" type="data" label="knownGene xRef file" help="" optional="false" /> - <param name="macsPeaksFile" type="data" label="Peaks File" help="" optional="false" /> - - <param name="upst_win" type="integer" label="Upstream Window" help="Window width in base pairs to consider promoter region [default: %default]" optional="false" value="5500" /> - <param name="dnst_win" type="integer" label="Downstream Window" help="Window width in base pairs to consider downstream region [default: %default]" optional="false" value="2500" /> - - <param name="tss" checked="true" label="Calculate downstream window from transcription start site instead of transcription end site" type="boolean" truevalue="--tss" falsevalue="" help="" /> - - <param name="peaks_fmt" type="select" label="Peaks Format" help="Format of peaks input file" optional="false"> - <option value="MACS">MACS</option> - <option selected="true" value="BED">BED</option> - </param> - - <param name="detail" checked="true" label="Append the following fields to each row: peak loc, dist from feature, map type, map subtype" type="boolean" truevalue="--detail" falsevalue="" help="" /> - <param name="intergenic" checked="false" label="Write intergenic peaks to the gene file as well with None as gene ID" type="boolean" truevalue="--intergenic" falsevalue="" help="" /> - </inputs> - <outputs> - <data name="peaksOutput" format="txt" hidden="false" /> - </outputs> - <help></help> -</tool> |
b |
diff -r 248c4538e4ce -r 63dace20719b chipsequtil/map_to_known_genes/._map_to_known_genes.py |
b |
Binary file chipsequtil/map_to_known_genes/._map_to_known_genes.py has changed |
b |
diff -r 248c4538e4ce -r 63dace20719b chipsequtil/map_to_known_genes/._map_to_known_genes.xml |
b |
Binary file chipsequtil/map_to_known_genes/._map_to_known_genes.xml has changed |
b |
diff -r 248c4538e4ce -r 63dace20719b chipsequtil/map_to_known_genes/map_to_known_genes.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil/map_to_known_genes/map_to_known_genes.py Wed Apr 13 17:36:07 2016 -0400 |
[ |
b'@@ -0,0 +1,230 @@\n+#!/usr/local/bin/python\n+\n+import sys, os\n+from optparse import OptionParser\n+from collections import defaultdict as dd\n+from csv import DictReader, DictWriter\n+\n+from chipsequtil import MACSFile, BEDFile, KnownGeneFile, parse_number\n+from chipsequtil.util import MultiLineHelpFormatter\n+\n+usage = \'%prog [options] <knownGene file> <knownGene xRef file> <peaks file>\'\n+description = """\n+Map the peaks in <peaks file> to genes in <knownGene file>. <knownGene file> is\\\n+format is as specified in http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/knownGene.sql.\\\n+<peaks file> format is as produced by MACS. If *auto* is chosen (default) file extension \\\n+is examined for *.xls* for default MACS format or *.bed* for BED format. If the --detail \\\n+option is provided, the following extra fields are appended to each row:\n+\n+peak loc, dist from feature, map type, map subtype\n+"""\n+epilog = \'\'\n+parser = OptionParser(usage=usage,description=description,epilog=epilog,formatter=MultiLineHelpFormatter())\n+parser.add_option(\'--upstream-window\',dest=\'upst_win\',type=\'int\',default=5500,help=\'window width in base pairs to consider promoter region [default: %default]\')\n+parser.add_option(\'--downstream-window\',dest=\'dnst_win\',type=\'int\',default=2500,help=\'window width in base pairs to consider downstream region [default: %default]\')\n+parser.add_option(\'--tss\',dest=\'tss\',action=\'store_true\',help=\'calculate downstream window from transcription start site instead of transcription end site\')\n+parser.add_option(\'--map-output\',dest=\'peak_output\',default=None,help=\'filename to output mapped peaks to [default: stdout]\')\n+parser.add_option(\'--stats-output\',dest=\'stats_output\',default=sys.stderr,help=\'filename to output summary stats in conversion [default: stderr]\')\n+parser.add_option(\'--peaks-format\',dest=\'peaks_fmt\',default=\'auto\',type=\'choice\',choices=[\'auto\',\'MACS\',\'BED\'],help=\'format of peaks input file [default: %default]\')\n+parser.add_option(\'--detail\',dest=\'detail\',action=\'store_true\',help=\'add extra fields to output, see description\')\n+parser.add_option(\'--intergenic\',dest=\'intergenic\',action=\'store_true\',help=\'write intergenic peaks to the gene file as well with None as gene ID\')\n+#parser.add_option(\'--symbol-xref\',dest=\'symbol_xref\',default=None,help=\'use the kgXref table file supplied to find a gene symbol, output as second column\')\n+\n+# TODO - options\n+#parser.add_option(\'--use-cds\',dest=\'use_cds\',action=\'store_true\',help=\'use cdsStart and cdsEnd fields instead of txStart and txEnd to do mapping\')\n+#parser.add_option(\'--capture-intergenic\'...)\n+#parser.add_option(\'--map-format\',dest=\'peak_format\',type=\'choice\',choices=[\'default\',\'BED\'],help=\'format of peak output [default: %default]\')\n+#parser.add_option(\'--stats-format\',dest=\'stats_format\',type=\'choice\',choices=[\'human\',\'python\'],help=\'format of summary stats output [default: %default]\')\n+\n+def parse_gene_ref(ref_gene) :\n+ reader = KnownGeneFile(ref_gene)\n+ gene_ref = dd(list)\n+ for ref_dict in reader :\n+ gene_ref[ref_dict[\'chrom\']].append(ref_dict)\n+\n+ return gene_ref\n+\n+\n+def parse_gene_ref_line(l) :\n+ l = map(parse_number, l) # coerce to numbers where possible\n+ l[9] = map(parse_number, l[9].split(\',\')) # turn \'x,x,x,...\' into list\n+ l[10] = map(parse_number, l[10].split(\',\'))\n+ return l\n+\n+\n+def main():\n+ opts, args = parser.parse_args(sys.argv[1:])\n+\n+ if len(args) < 3 :\n+ parser.error(\'Must provide three filename arguments\')\n+\n+ gene_ref = parse_gene_ref(args[0])\n+ xref_fn = args[1]\n+ peaks_fn = args[2]\n+\n+ if opts.peaks_fmt == \'MACS\' :\n+ peaks_reader_cls = MACSFile\n+ chr_field, start_field, end_field = \'chr\', \'start\', \'end\'\n+ elif opts.peaks_fmt == \'BED\' :\n+ peaks_reader_cls = BEDFile\n+ chr_field, start_field, end_field = \'chrom\', \'chromStart\', \'chromEnd\'\n+ else :\n+ # should never happen\n+ fieldnames = []\n+\n+ #peaks_reader = DictReader(open(args[1]),fieldna'..b"e['txStart'], gene['txEnd']\n+ downstream_coords = gene['txEnd']+1, gene['txEnd']+1+opts.dnst_win\n+ else :\n+ promoter_coords = gene['txEnd']+1, gene['txEnd']+1+opts.upst_win # +1 because we're using 1 based indexing\n+ if opts.tss :\n+ gene_coords = max(gene['txStart'],gene['txEnd']-opts.upst_win), gene['txEnd']\n+ downstream_coords = gene['txEnd']-1-opts.dnst_win, gene['txStart']-1 # -1 because we're using 1 based indexing\n+ else :\n+ gene_coords = gene['txStart'], gene['txEnd']\n+ downstream_coords = gene['txStart']-1-opts.dnst_win, gene['txStart']-1 # -1 because we're using 1 based indexing\n+\n+ # check for promoter\n+ if peak_loc >= promoter_coords[0] and peak_loc <= promoter_coords[1] :\n+ out_d['map type'] = 'promoter'\n+ out_d['dist from feature'] = peak_loc - promoter_coords[1] if gene['strand'] == '+' else promoter_coords[0] - peak_loc\n+\n+ # check for gene\n+ elif peak_loc >= gene_coords[0] and peak_loc <= gene_coords[1] :\n+ # check for intron/exon\n+ exon_coords = zip(gene['exonStarts'],gene['exonEnds'])\n+ in_exon = False\n+ for st,en in exon_coords :\n+ if peak_loc >= st and peak_loc <= en :\n+ in_exon = True\n+ break\n+ out_d['map type'] = 'gene'\n+ out_d['map subtype'] = 'exon' if in_exon else 'intron'\n+\n+ #Commented out to keep score reported in bed file - AJD 7/29/14\n+ # score = (peak-TSS)/(TSE-TSS) - peak distance from TSS as fraction of length of gene\n+ #gene_len = float(gene_coords[1]-gene_coords[0])\n+ #out_d['score'] = (peak_loc-gene_coords[0])/gene_len if gene['strand'] == '+' else (gene_coords[1]-peak_loc)/gene_len\n+\n+ # distance calculated from start of gene\n+ out_d['dist from feature'] = peak_loc - promoter_coords[1] if gene['strand'] == '+' else promoter_coords[0] - peak_loc\n+\n+ map_stats[out_d['map subtype']] += 1\n+\n+ # check for downstream\n+ elif peak_loc >= downstream_coords[0] and peak_loc <= downstream_coords[1] :\n+ out_d['map type'] = 'after'\n+ if opts.tss :\n+ out_d['dist from feature'] = peak_loc - gene_coords[0] if gene['strand'] == '+' else gene_coords[1] - peak_loc\n+ else :\n+ out_d['dist from feature'] = peak_loc - downstream_coords[0] if gene['strand'] == '+' else downstream_coords[1] - peak_loc\n+\n+ # does not map to this gene\n+ else :\n+ pass\n+\n+ # map type is not blank if we mapped to something\n+ if out_d['map type'] != '' :\n+\n+ #out_d = {'knownGeneID':gene['name']}\n+ out_d['knownGeneID'] = gene['name']\n+ if opts.symbol_xref :\n+ out_d['geneSymbol'] = symbol_xref_map[gene['name']]['geneSymbol']\n+ peaks_writer.writerow(out_d)\n+ mapped = True\n+\n+ # reset map_type\n+ out_d['map type'] = ''\n+\n+ if not mapped :\n+ if opts.intergenic :\n+ out_d['knownGeneID'] = 'None'\n+ out_d['geneSymbol'] = 'None'\n+ out_d['map type'] = 'intergenic'\n+ peaks_writer.writerow(out_d)\n+ map_stats['intergenic'] += 1\n+\n+ if peak_output != sys.stdout:\n+ peak_output.close()\n+\n+ #if opts.stats_output != sys.stderr :\n+ # opts.stats_output = open(opts.stats_output,'w')\n+\n+ #for k,v in map_stats.items() :\n+ # opts.stats_output.write('%s: %s\\n'%(k,v))\n+\n+ #if opts.stats_output != sys.stderr :\n+ # opts.stats_output.close()\n+\n+\n+if __name__ == '__main__' :\n+ main()\n" |
b |
diff -r 248c4538e4ce -r 63dace20719b chipsequtil/map_to_known_genes/map_to_known_genes.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil/map_to_known_genes/map_to_known_genes.xml Wed Apr 13 17:36:07 2016 -0400 |
[ |
@@ -0,0 +1,44 @@ +<tool id="chipsequtil_maptoknowngenes" name="Map Peaks to Known Genes" version="0.1"> + <description> + Map the peaks in <peaks file> to genes in <knownGene file>. <knownGene file> isformat is as specified in http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/knownGene.sql.<peaks file> format is as produced by MACS. If *auto* is chosen (default) file extension is examined for *.xls* for default MACS format or *.bed* for BED format. If the --detail option is provided, the following extra fields are appended to each row: + peak loc, dist from feature, map type, map subtype + </description> + <parallelism method="basic"></parallelism> + <requirements> + <requirement type="package" version="1.0">chipsequtil</requirement> + </requirements> + <command interpreter="python"> + map_to_known_genes.py + $tss + --peaks-format=$peaks_fmt + --upstream-window=$upst_win + --downstream-window=$dnst_win + --map-output="$peaksOutput" + $detail + $intergenic + $knownGeneFile $knownGeneRef $macsPeaksFile + + </command> + <inputs> + <param name="knownGeneFile" type="data" label="knownGene file" help="" optional="false" /> + <param name="knownGeneRef" type="data" label="knownGene xRef file" help="" optional="false" /> + <param name="macsPeaksFile" type="data" label="Peaks File" help="" optional="false" /> + + <param name="upst_win" type="integer" label="Upstream Window" help="Window width in base pairs to consider promoter region [default: %default]" optional="false" value="5500" /> + <param name="dnst_win" type="integer" label="Downstream Window" help="Window width in base pairs to consider downstream region [default: %default]" optional="false" value="2500" /> + + <param name="tss" checked="true" label="Calculate downstream window from transcription start site instead of transcription end site" type="boolean" truevalue="--tss" falsevalue="" help="" /> + + <param name="peaks_fmt" type="select" label="Peaks Format" help="Format of peaks input file" optional="false"> + <option value="MACS">MACS</option> + <option selected="true" value="BED">BED</option> + </param> + + <param name="detail" checked="true" label="Append the following fields to each row: peak loc, dist from feature, map type, map subtype" type="boolean" truevalue="--detail" falsevalue="" help="" /> + <param name="intergenic" checked="false" label="Write intergenic peaks to the gene file as well with None as gene ID" type="boolean" truevalue="--intergenic" falsevalue="" help="" /> + </inputs> + <outputs> + <data name="peaksOutput" format="txt" hidden="false" /> + </outputs> + <help></help> +</tool> |
b |
diff -r 248c4538e4ce -r 63dace20719b chipsequtil/map_to_known_genes/tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil/map_to_known_genes/tool_dependencies.xml Wed Apr 13 17:36:07 2016 -0400 |
b |
@@ -0,0 +1,17 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="chipsequtil" version="1.0"> + <install version="1.0"> + <actions> + <action type="download_by_url">https://github.com/fraenkel-lab/chipsequtil/archive/master.zip</action> + <action type="shell_command">cp org_settings.cfg src/chipsequtil/</action> + <action type="shell_command">python setup.py install --install-lib $INSTALL_DIR/lib/python --install-scripts $INSTALL_DIR/bin</action> + <action type="set_environment"> + <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/bin</environment_variable> + <environment_variable name="PYTHONPATH" action="prepend_to">$INSTALL_DIR/lib/python</environment_variable> + </action> + </actions> + </install> + <readme></readme> + </package> +</tool_dependency> |
b |
diff -r 248c4538e4ce -r 63dace20719b chipsequtil/pieplot_macs/pieplots_macs.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil/pieplot_macs/pieplots_macs.py Wed Apr 13 17:36:07 2016 -0400 |
[ |
@@ -0,0 +1,127 @@ +''' + NAME + + pieplots_macs.py + + SYNOPSIS + +python pieplots_macs.py --genefile MACSoutfile_genes.txt --peakfile MACSoutfile_peaks.bed --outfile MACSdirectory/piechart.pdf + + + DESCRIPTION + +Peaks are assigned to the closest gene and then categorized according to their location at different genomic regions (promoter, intron, exon, or after the gene). Sites >10kb away from any gene are considered intergenic. (from Pamela) + +''' + +__author__='Renan Escalante' +__email__='renanec@mit.edu' + +import matplotlib +matplotlib.use('pdf') +from matplotlib import pyplot as plt +matplotlib.rcParams['pdf.fonttype']=42 +matplotlib.rcParams['font.size']=14 +import sys +from argparse import ArgumentParser +import pandas as pd + +def map_peaks(gene,peak,outfile,macsFlag): + genefile = open(gene, 'r') + peakfile = open(peak, 'r') + + types = {'promoter':0, 'after':0, 'intron':0, 'exon': 0} + + #read mapped gene file, store closest map for each peak + peaks={} #{chrom:{peakStart:[dist, type]}} + for line in genefile: + words = line.strip().split('\t') + #ignore first line + if words[0] == 'knownGeneID': continue + chrom = words[2] + + + if not macsFlag: + try: + start = int(words[3]) + dist = abs(int(words[15])) + maptype = words[16] + if maptype == 'gene': + maptype = words[17] + except: + pass + + else: + start = int(words[3])-1 + dist = abs(int(words[12])) + maptype = words[14] + if maptype == 'gene': + maptype = words[15] + + + if chrom not in peaks: + #new chrom + peaks[chrom] = {start:[dist,maptype]} + else: + if start in peaks[chrom].keys(): + #account for duplicate entries - choose closest gene and store type + if dist < peaks[chrom][start][0]: + #closer gene + peaks[chrom][start] = [dist, maptype] + else: peaks[chrom][start] = [dist, maptype] + + #count types - 1 per peak in peak file + types = {'promoter':0, 'after':0, 'intron':0, 'exon': 0, 'inter': 0} + totalpks = 0 + #Read peak file in bed format + for line in peakfile: + words = line.strip().split('\t') + chrom = words[0] + start = int(words[1]) + if chrom in peaks: + if start in peaks[chrom]: + types[peaks[chrom][start][1]] += 1 + else: + types['inter'] += 1 + else: + types['inter'] += 1 + totalpks += 1 + + + #-------------------------------------------- + # make a square figure and axes + #-------------------------------------------- + + fig = plt.figure(figsize=(6,6)) + pie_ax = fig.add_axes((0.3,0.2,0.4,0.4)) + + # The slices will be ordered and plotted counter-clockwise. + labels = ['exon: %i'%types['exon'],'intron: %i'%types['intron'],'promoter: %i'%types['promoter'],'intergenic: %i'%types['inter'], 'after: %i'%types['after']] + fracs = [types['exon'], types['intron'], types['promoter'], types['inter'], types['after']] + + plt.pie(fracs, labels=labels) #, autopct='%1.1f%%') + + #Export data frame with all the counts + indexDataFrame = ['exon','intron','promoter','intergenic','after'] + df = pd.DataFrame(data=fracs, index=indexDataFrame) + dfFileName = outfile.replace("pdf","csv") + df.to_csv(dfFileName, sep=',') + #plt.title('MACS peaks in %s'%(name)) + plt.figtext(.5, .1, 'Total: %i'%totalpks, ha='center') + fig.savefig(outfile) + +def main(): + usage = "usage: %prog --genefile MACSoutfile_genes.txt --peakfile MACSoutfile_peaks.bed --outfile MACSdirectory/piechart.pdf" + parser = ArgumentParser(usage) + parser.add_argument("--genefile", dest="genefile", help="Path to file MACS_mfold10,30_pval1e-5_genes.txt") + parser.add_argument("--peakfile", dest="peakfile", help="Path to file MACS_mfold10,30_pval1e-5_peaks.bed") + parser.add_argument("--outfile", dest="outfile", default="MACS_piechart.pdf", help="Path to pdf file where you want to store the piechart") + parser.add_argument('--MACS',action='store_true',default=False,help='Set this value if you have MACS peaks') + + args=parser.parse_args() + + map_peaks(args.genefile, args.peakfile, args.outfile, args.MACS) + + +if __name__=='__main__': + main() |
b |
diff -r 248c4538e4ce -r 63dace20719b chipsequtil/pieplot_macs/pieplots_macs.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil/pieplot_macs/pieplots_macs.xml Wed Apr 13 17:36:07 2016 -0400 |
b |
@@ -0,0 +1,22 @@ +<tool id="pieplots_macs" name="Pieplots MACS" version="0.1"> + <description> + Peaks are assigned to the closest gene and then categorized according to their location at different genomic regions (promoter, intron, exon, or after the gene). Sites >10kb away from any gene are considered intergenic. + </description> + <parallelism method="basic"></parallelism> + <requirements> + <requirement type="package" version="1.2.1">matplotlib</requirement> + <requirement type="package">pandas</requirement> + </requirements> + <command interpreter="python"> + pieplots_macs.py genefile $genefilepeakfile $peakfile $MACSoutfile $out + </command> + <inputs> + <param name="genefile" type="data" label="genefile file" help="" optional="false" /> + <param name="peakfile" type="data" label="peakfile xRef file" help="" optional="false" /> + <param name="MACS" checked="false" label="Set this value if you have MACS peaks" type="boolean" truevalue="--MACS" falsevalue="" help="" /> + </inputs> + <outputs> + <data name="out" format="pdf" hidden="false" /> + </outputs> + <help></help> +</tool> |
b |
diff -r 248c4538e4ce -r 63dace20719b chipsequtil/tool_dependencies.xml --- a/chipsequtil/tool_dependencies.xml Mon Apr 04 16:41:59 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,17 +0,0 @@ -<?xml version="1.0"?> -<tool_dependency> - <package name="chipsequtil" version="1.0"> - <install version="1.0"> - <actions> - <action type="download_by_url">https://github.com/fraenkel-lab/chipsequtil/archive/master.zip</action> - <action type="shell_command">cp org_settings.cfg src/chipsequtil/</action> - <action type="shell_command">python setup.py install --install-lib $INSTALL_DIR/lib/python --install-scripts $INSTALL_DIR/bin</action> - <action type="set_environment"> - <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/bin</environment_variable> - <environment_variable name="PYTHONPATH" action="prepend_to">$INSTALL_DIR/lib/python</environment_variable> - </action> - </actions> - </install> - <readme></readme> - </package> -</tool_dependency> |