Next changeset 1:dfa6d9262b4e (2016-03-30) |
Commit message:
Uploaded |
added:
chipsequtil/map_to_known_genes.py chipsequtil/map_to_known_genes.xml chipsequtil/tool_dependencies.xml |
b |
diff -r 000000000000 -r 4f66143b385a chipsequtil/map_to_known_genes.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil/map_to_known_genes.py Mon Mar 28 13:08:59 2016 -0400 |
[ |
b'@@ -0,0 +1,236 @@\n+#!/usr/local/bin/python\n+\n+import sys, os\n+from optparse import OptionParser\n+from collections import defaultdict as dd\n+from csv import DictReader, DictWriter\n+\n+from chipsequtil import MACSFile, BEDFile, KnownGeneFile, parse_number\n+from chipsequtil.util import MultiLineHelpFormatter\n+\n+usage = \'%prog [options] <knownGene file> <knownGene xRef file> <peaks file>\'\n+description = """\n+Map the peaks in <peaks file> to genes in <knownGene file>. <knownGene file> is\\\n+format is as specified in http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/knownGene.sql.\\\n+<peaks file> format is as produced by MACS. If *auto* is chosen (default) file extension \\\n+is examined for *.xls* for default MACS format or *.bed* for BED format. If the --detail \\\n+option is provided, the following extra fields are appended to each row:\n+\n+peak loc, dist from feature, map type, map subtype\n+"""\n+epilog = \'\'\n+parser = OptionParser(usage=usage,description=description,epilog=epilog,formatter=MultiLineHelpFormatter())\n+parser.add_option(\'--upstream-window\',dest=\'upst_win\',type=\'int\',default=5500,help=\'window width in base pairs to consider promoter region [default: %default]\')\n+parser.add_option(\'--downstream-window\',dest=\'dnst_win\',type=\'int\',default=2500,help=\'window width in base pairs to consider downstream region [default: %default]\')\n+parser.add_option(\'--tss\',dest=\'tss\',action=\'store_true\',help=\'calculate downstream window from transcription start site instead of transcription end site\')\n+parser.add_option(\'--map-output\',dest=\'peak_output\',default=None,help=\'filename to output mapped peaks to [default: stdout]\')\n+parser.add_option(\'--stats-output\',dest=\'stats_output\',default=sys.stderr,help=\'filename to output summary stats in conversion [default: stderr]\')\n+parser.add_option(\'--peaks-format\',dest=\'peaks_fmt\',default=\'auto\',type=\'choice\',choices=[\'auto\',\'MACS\',\'BED\'],help=\'format of peaks input file [default: %default]\')\n+parser.add_option(\'--detail\',dest=\'detail\',action=\'store_true\',help=\'add extra fields to output, see description\')\n+parser.add_option(\'--intergenic\',dest=\'intergenic\',action=\'store_true\',help=\'write intergenic peaks to the gene file as well with None as gene ID\')\n+#parser.add_option(\'--symbol-xref\',dest=\'symbol_xref\',default=None,help=\'use the kgXref table file supplied to find a gene symbol, output as second column\')\n+\n+# TODO - options\n+#parser.add_option(\'--use-cds\',dest=\'use_cds\',action=\'store_true\',help=\'use cdsStart and cdsEnd fields instead of txStart and txEnd to do mapping\')\n+#parser.add_option(\'--capture-intergenic\'...)\n+#parser.add_option(\'--map-format\',dest=\'peak_format\',type=\'choice\',choices=[\'default\',\'BED\'],help=\'format of peak output [default: %default]\')\n+#parser.add_option(\'--stats-format\',dest=\'stats_format\',type=\'choice\',choices=[\'human\',\'python\'],help=\'format of summary stats output [default: %default]\')\n+\n+def parse_gene_ref(ref_gene) :\n+ reader = KnownGeneFile(ref_gene)\n+ gene_ref = dd(list)\n+ for ref_dict in reader :\n+ gene_ref[ref_dict[\'chrom\']].append(ref_dict)\n+\n+ return gene_ref\n+\n+def parse_gene_ref_line(l) :\n+ l = map(parse_number, l) # coerce to numbers where possible\n+ l[9] = map(parse_number, l[9].split(\',\')) # turn \'x,x,x,...\' into list\n+ l[10] = map(parse_number, l[10].split(\',\'))\n+ return l\n+\n+if __name__ == \'__main__\' :\n+\n+ opts, args = parser.parse_args(sys.argv[1:])\n+\n+ if len(args) < 3 :\n+ parser.error(\'Must provide three filename arguments\')\n+\n+ gene_ref = parse_gene_ref(args[0])\n+ xref_fn = args[1]\n+ peaks_fn = args[2]\n+ if opts.peaks_fmt == \'auto\' :\n+ path,ext = os.path.splitext(peaks_fn)\n+ if ext.lower() == \'.xls\' :\n+ opts.peaks_fmt = \'MACS\'\n+ elif ext.lower() == \'.bed\' :\n+ opts.peaks_fmt = \'BED\'\n+ elif ext.lower() == \'.narrowpeak\' :\n+ opts.peaks_fmt = \'BED\'\n+ else :\n+ parser.error(\'Could not guess peaks file format by extension ('..b"e :\n+ gene_coords = gene['txStart'], gene['txEnd']\n+ downstream_coords = gene['txEnd']+1, gene['txEnd']+1+opts.dnst_win\n+ else :\n+ promoter_coords = gene['txEnd']+1, gene['txEnd']+1+opts.upst_win # +1 because we're using 1 based indexing\n+ if opts.tss :\n+ gene_coords = max(gene['txStart'],gene['txEnd']-opts.upst_win), gene['txEnd']\n+ downstream_coords = gene['txEnd']-1-opts.dnst_win, gene['txStart']-1 # -1 because we're using 1 based indexing\n+ else :\n+ gene_coords = gene['txStart'], gene['txEnd']\n+ downstream_coords = gene['txStart']-1-opts.dnst_win, gene['txStart']-1 # -1 because we're using 1 based indexing\n+\n+ # check for promoter\n+ if peak_loc >= promoter_coords[0] and peak_loc <= promoter_coords[1] :\n+ out_d['map type'] = 'promoter'\n+ out_d['dist from feature'] = peak_loc - promoter_coords[1] if gene['strand'] == '+' else promoter_coords[0] - peak_loc\n+\n+ # check for gene\n+ elif peak_loc >= gene_coords[0] and peak_loc <= gene_coords[1] :\n+ # check for intron/exon\n+ exon_coords = zip(gene['exonStarts'],gene['exonEnds'])\n+ in_exon = False\n+ for st,en in exon_coords :\n+ if peak_loc >= st and peak_loc <= en :\n+ in_exon = True\n+ break\n+ out_d['map type'] = 'gene'\n+ out_d['map subtype'] = 'exon' if in_exon else 'intron'\n+\n+ #Commented out to keep score reported in bed file - AJD 7/29/14\n+ # score = (peak-TSS)/(TSE-TSS) - peak distance from TSS as fraction of length of gene\n+ #gene_len = float(gene_coords[1]-gene_coords[0])\n+ #out_d['score'] = (peak_loc-gene_coords[0])/gene_len if gene['strand'] == '+' else (gene_coords[1]-peak_loc)/gene_len\n+\n+ # distance calculated from start of gene\n+ out_d['dist from feature'] = peak_loc - promoter_coords[1] if gene['strand'] == '+' else promoter_coords[0] - peak_loc\n+\n+ map_stats[out_d['map subtype']] += 1\n+\n+ # check for downstream\n+ elif peak_loc >= downstream_coords[0] and peak_loc <= downstream_coords[1] :\n+ out_d['map type'] = 'after'\n+ if opts.tss :\n+ out_d['dist from feature'] = peak_loc - gene_coords[0] if gene['strand'] == '+' else gene_coords[1] - peak_loc\n+ else :\n+ out_d['dist from feature'] = peak_loc - downstream_coords[0] if gene['strand'] == '+' else downstream_coords[1] - peak_loc\n+\n+ # does not map to this gene\n+ else :\n+ pass\n+\n+ # map type is not blank if we mapped to something\n+ if out_d['map type'] != '' :\n+\n+ #out_d = {'knownGeneID':gene['name']}\n+ out_d['knownGeneID'] = gene['name']\n+ if opts.symbol_xref :\n+ out_d['geneSymbol'] = symbol_xref_map[gene['name']]['geneSymbol']\n+ peaks_writer.writerow(out_d)\n+\n+ mapped = True\n+\n+ # reset map_type\n+ out_d['map type'] = ''\n+\n+ if not mapped :\n+ if opts.intergenic :\n+ out_d['knownGeneID'] = 'None'\n+ out_d['geneSymbol'] = 'None'\n+ out_d['map type'] = 'intergenic'\n+ peaks_writer.writerow(out_d)\n+ map_stats['intergenic'] += 1\n+\n+ if peak_output != sys.stdout :\n+ peak_output.close()\n+\n+ #if opts.stats_output != sys.stderr :\n+ # opts.stats_output = open(opts.stats_output,'w')\n+\n+ #for k,v in map_stats.items() :\n+ # opts.stats_output.write('%s: %s\\n'%(k,v))\n+\n+ #if opts.stats_output != sys.stderr :\n+ # opts.stats_output.close()\n" |
b |
diff -r 000000000000 -r 4f66143b385a chipsequtil/map_to_known_genes.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil/map_to_known_genes.xml Mon Mar 28 13:08:59 2016 -0400 |
[ |
@@ -0,0 +1,46 @@ +<tool id="chipsequtil_maptoknowngenes" name="Map Peaks to Known Genes" version="0.1"> + <description> + Map the peaks in <peaks file> to genes in <knownGene file>. <knownGene file> isformat is as specified in http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/knownGene.sql.<peaks file> format is as produced by MACS. If *auto* is chosen (default) file extension is examined for *.xls* for default MACS format or *.bed* for BED format. If the --detail option is provided, the following extra fields are appended to each row: + peak loc, dist from feature, map type, map subtype + </description> + <parallelism method="basic"></parallelism> + <requirements> + <requirement type="package" version="1.0">chipsequtil</requirement> + </requirements> + <command interpreter="python"> + map_to_known_genes.py + $tss + --upstream-window=$upst_win + --downstream-window=$dnst_win + --map-output=$peaksOutput + --peaks-format=$peaks_fmt + $detail + $intergenic + $knownGeneFile $knownGeneRef $macsPeaksFile + + </command> + <inputs> + <param name="knownGeneFile" type="data" label="knownGene file" help="" optional="false" /> + <param name="knownGeneRef" type="data" label="knownGene xRef file" help="" optional="false" /> + <param name="macsPeaksFile" type="data" label="Peaks File" help="" optional="false" /> + <param name="peaksOutput" type="text" label="Output filename" help="filename to output mapped peaks to" optional="false" /> + + <param name="upst_win" type="integer" label="Upstream Window" help="Window width in base pairs to consider promoter region [default: %default]" optional="false" value="5500" /> + <param name="dnst_win" type="integer" label="Downstream Window" help="Window width in base pairs to consider downstream region [default: %default]" optional="false" value="2500" /> + + <param name="tss" checked="true" label="calculate downstream window from transcription start site instead of transcription end site" type="boolean" truevalue="--tss" falsevalue="" help="" /> + + <param name="peaks_fmt" type="select" label="Peaks Format" help="Format of peaks input file" optional="false"> + <option value="auto">auto</option> + <option value="MACS">MACS</option> + <option selected="true" value="BED">BED</option> + </param> + + <param name="detail" checked="false" label="Add extra fields to output" type="boolean" truevalue="--detail" falsevalue="" help="" /> + <param name="intergenic" checked="false" label="Write intergenic peaks to the gene file as well with None as gene ID" type="boolean" truevalue="--intergenic" falsevalue="" help="" /> + </inputs> + <outputs> + <data format="txt" hidden="false" name="default"/> + </outputs> + <help></help> +</tool> |
b |
diff -r 000000000000 -r 4f66143b385a chipsequtil/tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil/tool_dependencies.xml Mon Mar 28 13:08:59 2016 -0400 |
b |
@@ -0,0 +1,15 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="chipsequtil" version="1.0"> + <install version="1.0"> + <actions> + <action type="download_by_url">https://github.com/adamlabadorf/chipsequtil/archive/master.zip</action> + <action type="shell_command">unzip chipsequtil-master.zip -d chipsequtil</action> + <action type="shell_command">cd chipsequtil</action> + <action type="shell_command">cp org_settings.cfg src/chipsequtil/</action> + <action type="shell_command">python setup.py install</action> + </actions> + </install> + <readme></readme> + </package> +</tool_dependency> |