# HG changeset patch # User alenail # Date 1460583367 14400 # Node ID 63dace20719b0cee1ca26be7c77c0f29000f70e2 # Parent 248c4538e4ce5fe9ab4e748bb78a5938c395c9f7 Uploaded diff -r 248c4538e4ce -r 63dace20719b chipsequtil/map_to_known_genes.py --- a/chipsequtil/map_to_known_genes.py Mon Apr 04 16:41:59 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,230 +0,0 @@ -#!/usr/local/bin/python - -import sys, os -from optparse import OptionParser -from collections import defaultdict as dd -from csv import DictReader, DictWriter - -from chipsequtil import MACSFile, BEDFile, KnownGeneFile, parse_number -from chipsequtil.util import MultiLineHelpFormatter - -usage = '%prog [options] ' -description = """ -Map the peaks in to genes in . is\ -format is as specified in http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/knownGene.sql.\ - format is as produced by MACS. If *auto* is chosen (default) file extension \ -is examined for *.xls* for default MACS format or *.bed* for BED format. If the --detail \ -option is provided, the following extra fields are appended to each row: - -peak loc, dist from feature, map type, map subtype -""" -epilog = '' -parser = OptionParser(usage=usage,description=description,epilog=epilog,formatter=MultiLineHelpFormatter()) -parser.add_option('--upstream-window',dest='upst_win',type='int',default=5500,help='window width in base pairs to consider promoter region [default: %default]') -parser.add_option('--downstream-window',dest='dnst_win',type='int',default=2500,help='window width in base pairs to consider downstream region [default: %default]') -parser.add_option('--tss',dest='tss',action='store_true',help='calculate downstream window from transcription start site instead of transcription end site') -parser.add_option('--map-output',dest='peak_output',default=None,help='filename to output mapped peaks to [default: stdout]') -parser.add_option('--stats-output',dest='stats_output',default=sys.stderr,help='filename to output summary stats in conversion [default: stderr]') -parser.add_option('--peaks-format',dest='peaks_fmt',default='auto',type='choice',choices=['auto','MACS','BED'],help='format of peaks input file [default: %default]') -parser.add_option('--detail',dest='detail',action='store_true',help='add extra fields to output, see description') -parser.add_option('--intergenic',dest='intergenic',action='store_true',help='write intergenic peaks to the gene file as well with None as gene ID') -#parser.add_option('--symbol-xref',dest='symbol_xref',default=None,help='use the kgXref table file supplied to find a gene symbol, output as second column') - -# TODO - options -#parser.add_option('--use-cds',dest='use_cds',action='store_true',help='use cdsStart and cdsEnd fields instead of txStart and txEnd to do mapping') -#parser.add_option('--capture-intergenic'...) -#parser.add_option('--map-format',dest='peak_format',type='choice',choices=['default','BED'],help='format of peak output [default: %default]') -#parser.add_option('--stats-format',dest='stats_format',type='choice',choices=['human','python'],help='format of summary stats output [default: %default]') - -def parse_gene_ref(ref_gene) : - reader = KnownGeneFile(ref_gene) - gene_ref = dd(list) - for ref_dict in reader : - gene_ref[ref_dict['chrom']].append(ref_dict) - - return gene_ref - - -def parse_gene_ref_line(l) : - l = map(parse_number, l) # coerce to numbers where possible - l[9] = map(parse_number, l[9].split(',')) # turn 'x,x,x,...' into list - l[10] = map(parse_number, l[10].split(',')) - return l - - -def main(): - opts, args = parser.parse_args(sys.argv[1:]) - - if len(args) < 3 : - parser.error('Must provide three filename arguments') - - gene_ref = parse_gene_ref(args[0]) - xref_fn = args[1] - peaks_fn = args[2] - - if opts.peaks_fmt == 'MACS' : - peaks_reader_cls = MACSFile - chr_field, start_field, end_field = 'chr', 'start', 'end' - elif opts.peaks_fmt == 'BED' : - peaks_reader_cls = BEDFile - chr_field, start_field, end_field = 'chrom', 'chromStart', 'chromEnd' - else : - # should never happen - fieldnames = [] - - #peaks_reader = DictReader(open(args[1]),fieldnames=fieldnames,delimiter='\t') - peaks_reader = peaks_reader_cls(peaks_fn) - - # default output format: - if opts.peak_output : - peak_output = open(opts.peak_output,'w') - else : - peak_output = sys.stdout - - fieldnames = peaks_reader.FIELD_NAMES - if opts.detail : - fieldnames += ["peak loc","dist from feature","map type","map subtype"]#"score" - output_fields = ['knownGeneID']+fieldnames - - # see if the user wants gene symbols too - # TODO - actually make this an option, or make it required - opts.symbol_xref = xref_fn - if opts.symbol_xref : - kgXref_fieldnames = ['kgID','mRNA','spID','spDisplayID','geneSymbol','refseq','protAcc','description'] - symbol_xref_reader = DictReader(open(opts.symbol_xref),fieldnames=kgXref_fieldnames,delimiter='\t') - symbol_xref_map = {} - for rec in symbol_xref_reader : - symbol_xref_map[rec['kgID']] = rec - output_fields = ['knownGeneID','geneSymbol']+fieldnames - - peaks_writer = DictWriter(peak_output,output_fields,delimiter='\t',extrasaction='ignore',lineterminator='\n') - peaks_writer.writerow(dict([(k,k) for k in output_fields])) - unique_genes = set() - map_stats = dd(int) - for peak in peaks_reader : - - # if this is a comment or header line get skip it - if peak[fieldnames[0]].startswith('#') or \ - peak[fieldnames[0]] == fieldnames[0] or \ - peak[fieldnames[0]].startswith('track') : continue - - # coerce values to numeric if possible - for k,v in peak.items() : peak[k] = parse_number(v) - - # MACS output gives us summit - if opts.peaks_fmt == 'MACS' : - peak_loc = peak[start_field]+peak['summit'] - else : # peak assumed to be in the middle of the reported peak range - peak_loc = (peak[start_field]+peak[end_field])/2 - - chrom_genes = gene_ref[peak[chr_field]] - - if len(chrom_genes) == 0 : - sys.stdout.write('WARNING: peak chromosome %s not found in gene reference, skipping: %s\n'%(peak[chr_field],peak)) - continue - - mapped = False - - # walk through the genes for this chromosome - for gene in chrom_genes : - - # reusable dictionary for output - out_d = {}.fromkeys(output_fields,0) - out_d.update(peak) - out_d['map type'] = '' - out_d['chromo'] = peak[chr_field] - out_d['peak loc'] = peak_loc - - # determine intervals for promoter, gene, and downstream - if gene['strand'] == '+' : - promoter_coords = max(gene['txStart']-1-opts.upst_win,0), gene['txStart']-1 - if opts.tss : - gene_coords = gene['txStart'], min(gene['txEnd'],gene['txStart']+opts.dnst_win) - downstream_coords = gene['txEnd']+1,gene['txStart']+opts.dnst_win - else : - gene_coords = gene['txStart'], gene['txEnd'] - downstream_coords = gene['txEnd']+1, gene['txEnd']+1+opts.dnst_win - else : - promoter_coords = gene['txEnd']+1, gene['txEnd']+1+opts.upst_win # +1 because we're using 1 based indexing - if opts.tss : - gene_coords = max(gene['txStart'],gene['txEnd']-opts.upst_win), gene['txEnd'] - downstream_coords = gene['txEnd']-1-opts.dnst_win, gene['txStart']-1 # -1 because we're using 1 based indexing - else : - gene_coords = gene['txStart'], gene['txEnd'] - downstream_coords = gene['txStart']-1-opts.dnst_win, gene['txStart']-1 # -1 because we're using 1 based indexing - - # check for promoter - if peak_loc >= promoter_coords[0] and peak_loc <= promoter_coords[1] : - out_d['map type'] = 'promoter' - out_d['dist from feature'] = peak_loc - promoter_coords[1] if gene['strand'] == '+' else promoter_coords[0] - peak_loc - - # check for gene - elif peak_loc >= gene_coords[0] and peak_loc <= gene_coords[1] : - # check for intron/exon - exon_coords = zip(gene['exonStarts'],gene['exonEnds']) - in_exon = False - for st,en in exon_coords : - if peak_loc >= st and peak_loc <= en : - in_exon = True - break - out_d['map type'] = 'gene' - out_d['map subtype'] = 'exon' if in_exon else 'intron' - - #Commented out to keep score reported in bed file - AJD 7/29/14 - # score = (peak-TSS)/(TSE-TSS) - peak distance from TSS as fraction of length of gene - #gene_len = float(gene_coords[1]-gene_coords[0]) - #out_d['score'] = (peak_loc-gene_coords[0])/gene_len if gene['strand'] == '+' else (gene_coords[1]-peak_loc)/gene_len - - # distance calculated from start of gene - out_d['dist from feature'] = peak_loc - promoter_coords[1] if gene['strand'] == '+' else promoter_coords[0] - peak_loc - - map_stats[out_d['map subtype']] += 1 - - # check for downstream - elif peak_loc >= downstream_coords[0] and peak_loc <= downstream_coords[1] : - out_d['map type'] = 'after' - if opts.tss : - out_d['dist from feature'] = peak_loc - gene_coords[0] if gene['strand'] == '+' else gene_coords[1] - peak_loc - else : - out_d['dist from feature'] = peak_loc - downstream_coords[0] if gene['strand'] == '+' else downstream_coords[1] - peak_loc - - # does not map to this gene - else : - pass - - # map type is not blank if we mapped to something - if out_d['map type'] != '' : - - #out_d = {'knownGeneID':gene['name']} - out_d['knownGeneID'] = gene['name'] - if opts.symbol_xref : - out_d['geneSymbol'] = symbol_xref_map[gene['name']]['geneSymbol'] - peaks_writer.writerow(out_d) - mapped = True - - # reset map_type - out_d['map type'] = '' - - if not mapped : - if opts.intergenic : - out_d['knownGeneID'] = 'None' - out_d['geneSymbol'] = 'None' - out_d['map type'] = 'intergenic' - peaks_writer.writerow(out_d) - map_stats['intergenic'] += 1 - - if peak_output != sys.stdout: - peak_output.close() - - #if opts.stats_output != sys.stderr : - # opts.stats_output = open(opts.stats_output,'w') - - #for k,v in map_stats.items() : - # opts.stats_output.write('%s: %s\n'%(k,v)) - - #if opts.stats_output != sys.stderr : - # opts.stats_output.close() - - -if __name__ == '__main__' : - main() diff -r 248c4538e4ce -r 63dace20719b chipsequtil/map_to_known_genes.xml --- a/chipsequtil/map_to_known_genes.xml Mon Apr 04 16:41:59 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,44 +0,0 @@ - - - Map the peaks in <peaks file> to genes in <knownGene file>. <knownGene file> isformat is as specified in http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/knownGene.sql.<peaks file> format is as produced by MACS. If *auto* is chosen (default) file extension is examined for *.xls* for default MACS format or *.bed* for BED format. If the --detail option is provided, the following extra fields are appended to each row: - peak loc, dist from feature, map type, map subtype - - - - chipsequtil - - - map_to_known_genes.py - $tss - --peaks-format=$peaks_fmt - --upstream-window=$upst_win - --downstream-window=$dnst_win - --map-output="$peaksOutput" - $detail - $intergenic - $knownGeneFile $knownGeneRef $macsPeaksFile - - - - - - - - - - - - - - - - - - - - - - - - diff -r 248c4538e4ce -r 63dace20719b chipsequtil/map_to_known_genes/._map_to_known_genes.py Binary file chipsequtil/map_to_known_genes/._map_to_known_genes.py has changed diff -r 248c4538e4ce -r 63dace20719b chipsequtil/map_to_known_genes/._map_to_known_genes.xml Binary file chipsequtil/map_to_known_genes/._map_to_known_genes.xml has changed diff -r 248c4538e4ce -r 63dace20719b chipsequtil/map_to_known_genes/map_to_known_genes.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil/map_to_known_genes/map_to_known_genes.py Wed Apr 13 17:36:07 2016 -0400 @@ -0,0 +1,230 @@ +#!/usr/local/bin/python + +import sys, os +from optparse import OptionParser +from collections import defaultdict as dd +from csv import DictReader, DictWriter + +from chipsequtil import MACSFile, BEDFile, KnownGeneFile, parse_number +from chipsequtil.util import MultiLineHelpFormatter + +usage = '%prog [options] ' +description = """ +Map the peaks in to genes in . is\ +format is as specified in http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/knownGene.sql.\ + format is as produced by MACS. If *auto* is chosen (default) file extension \ +is examined for *.xls* for default MACS format or *.bed* for BED format. If the --detail \ +option is provided, the following extra fields are appended to each row: + +peak loc, dist from feature, map type, map subtype +""" +epilog = '' +parser = OptionParser(usage=usage,description=description,epilog=epilog,formatter=MultiLineHelpFormatter()) +parser.add_option('--upstream-window',dest='upst_win',type='int',default=5500,help='window width in base pairs to consider promoter region [default: %default]') +parser.add_option('--downstream-window',dest='dnst_win',type='int',default=2500,help='window width in base pairs to consider downstream region [default: %default]') +parser.add_option('--tss',dest='tss',action='store_true',help='calculate downstream window from transcription start site instead of transcription end site') +parser.add_option('--map-output',dest='peak_output',default=None,help='filename to output mapped peaks to [default: stdout]') +parser.add_option('--stats-output',dest='stats_output',default=sys.stderr,help='filename to output summary stats in conversion [default: stderr]') +parser.add_option('--peaks-format',dest='peaks_fmt',default='auto',type='choice',choices=['auto','MACS','BED'],help='format of peaks input file [default: %default]') +parser.add_option('--detail',dest='detail',action='store_true',help='add extra fields to output, see description') +parser.add_option('--intergenic',dest='intergenic',action='store_true',help='write intergenic peaks to the gene file as well with None as gene ID') +#parser.add_option('--symbol-xref',dest='symbol_xref',default=None,help='use the kgXref table file supplied to find a gene symbol, output as second column') + +# TODO - options +#parser.add_option('--use-cds',dest='use_cds',action='store_true',help='use cdsStart and cdsEnd fields instead of txStart and txEnd to do mapping') +#parser.add_option('--capture-intergenic'...) +#parser.add_option('--map-format',dest='peak_format',type='choice',choices=['default','BED'],help='format of peak output [default: %default]') +#parser.add_option('--stats-format',dest='stats_format',type='choice',choices=['human','python'],help='format of summary stats output [default: %default]') + +def parse_gene_ref(ref_gene) : + reader = KnownGeneFile(ref_gene) + gene_ref = dd(list) + for ref_dict in reader : + gene_ref[ref_dict['chrom']].append(ref_dict) + + return gene_ref + + +def parse_gene_ref_line(l) : + l = map(parse_number, l) # coerce to numbers where possible + l[9] = map(parse_number, l[9].split(',')) # turn 'x,x,x,...' into list + l[10] = map(parse_number, l[10].split(',')) + return l + + +def main(): + opts, args = parser.parse_args(sys.argv[1:]) + + if len(args) < 3 : + parser.error('Must provide three filename arguments') + + gene_ref = parse_gene_ref(args[0]) + xref_fn = args[1] + peaks_fn = args[2] + + if opts.peaks_fmt == 'MACS' : + peaks_reader_cls = MACSFile + chr_field, start_field, end_field = 'chr', 'start', 'end' + elif opts.peaks_fmt == 'BED' : + peaks_reader_cls = BEDFile + chr_field, start_field, end_field = 'chrom', 'chromStart', 'chromEnd' + else : + # should never happen + fieldnames = [] + + #peaks_reader = DictReader(open(args[1]),fieldnames=fieldnames,delimiter='\t') + peaks_reader = peaks_reader_cls(peaks_fn) + + # default output format: + if opts.peak_output : + peak_output = open(opts.peak_output,'w') + else : + peak_output = sys.stdout + + fieldnames = peaks_reader.FIELD_NAMES + if opts.detail : + fieldnames += ["peak loc","dist from feature","map type","map subtype"]#"score" + output_fields = ['knownGeneID']+fieldnames + + # see if the user wants gene symbols too + # TODO - actually make this an option, or make it required + opts.symbol_xref = xref_fn + if opts.symbol_xref : + kgXref_fieldnames = ['kgID','mRNA','spID','spDisplayID','geneSymbol','refseq','protAcc','description'] + symbol_xref_reader = DictReader(open(opts.symbol_xref),fieldnames=kgXref_fieldnames,delimiter='\t') + symbol_xref_map = {} + for rec in symbol_xref_reader : + symbol_xref_map[rec['kgID']] = rec + output_fields = ['knownGeneID','geneSymbol']+fieldnames + + peaks_writer = DictWriter(peak_output,output_fields,delimiter='\t',extrasaction='ignore',lineterminator='\n') + peaks_writer.writerow(dict([(k,k) for k in output_fields])) + unique_genes = set() + map_stats = dd(int) + for peak in peaks_reader : + + # if this is a comment or header line get skip it + if peak[fieldnames[0]].startswith('#') or \ + peak[fieldnames[0]] == fieldnames[0] or \ + peak[fieldnames[0]].startswith('track') : continue + + # coerce values to numeric if possible + for k,v in peak.items() : peak[k] = parse_number(v) + + # MACS output gives us summit + if opts.peaks_fmt == 'MACS' : + peak_loc = peak[start_field]+peak['summit'] + else : # peak assumed to be in the middle of the reported peak range + peak_loc = (peak[start_field]+peak[end_field])/2 + + chrom_genes = gene_ref[peak[chr_field]] + + if len(chrom_genes) == 0 : + sys.stdout.write('WARNING: peak chromosome %s not found in gene reference, skipping: %s\n'%(peak[chr_field],peak)) + continue + + mapped = False + + # walk through the genes for this chromosome + for gene in chrom_genes : + + # reusable dictionary for output + out_d = {}.fromkeys(output_fields,0) + out_d.update(peak) + out_d['map type'] = '' + out_d['chromo'] = peak[chr_field] + out_d['peak loc'] = peak_loc + + # determine intervals for promoter, gene, and downstream + if gene['strand'] == '+' : + promoter_coords = max(gene['txStart']-1-opts.upst_win,0), gene['txStart']-1 + if opts.tss : + gene_coords = gene['txStart'], min(gene['txEnd'],gene['txStart']+opts.dnst_win) + downstream_coords = gene['txEnd']+1,gene['txStart']+opts.dnst_win + else : + gene_coords = gene['txStart'], gene['txEnd'] + downstream_coords = gene['txEnd']+1, gene['txEnd']+1+opts.dnst_win + else : + promoter_coords = gene['txEnd']+1, gene['txEnd']+1+opts.upst_win # +1 because we're using 1 based indexing + if opts.tss : + gene_coords = max(gene['txStart'],gene['txEnd']-opts.upst_win), gene['txEnd'] + downstream_coords = gene['txEnd']-1-opts.dnst_win, gene['txStart']-1 # -1 because we're using 1 based indexing + else : + gene_coords = gene['txStart'], gene['txEnd'] + downstream_coords = gene['txStart']-1-opts.dnst_win, gene['txStart']-1 # -1 because we're using 1 based indexing + + # check for promoter + if peak_loc >= promoter_coords[0] and peak_loc <= promoter_coords[1] : + out_d['map type'] = 'promoter' + out_d['dist from feature'] = peak_loc - promoter_coords[1] if gene['strand'] == '+' else promoter_coords[0] - peak_loc + + # check for gene + elif peak_loc >= gene_coords[0] and peak_loc <= gene_coords[1] : + # check for intron/exon + exon_coords = zip(gene['exonStarts'],gene['exonEnds']) + in_exon = False + for st,en in exon_coords : + if peak_loc >= st and peak_loc <= en : + in_exon = True + break + out_d['map type'] = 'gene' + out_d['map subtype'] = 'exon' if in_exon else 'intron' + + #Commented out to keep score reported in bed file - AJD 7/29/14 + # score = (peak-TSS)/(TSE-TSS) - peak distance from TSS as fraction of length of gene + #gene_len = float(gene_coords[1]-gene_coords[0]) + #out_d['score'] = (peak_loc-gene_coords[0])/gene_len if gene['strand'] == '+' else (gene_coords[1]-peak_loc)/gene_len + + # distance calculated from start of gene + out_d['dist from feature'] = peak_loc - promoter_coords[1] if gene['strand'] == '+' else promoter_coords[0] - peak_loc + + map_stats[out_d['map subtype']] += 1 + + # check for downstream + elif peak_loc >= downstream_coords[0] and peak_loc <= downstream_coords[1] : + out_d['map type'] = 'after' + if opts.tss : + out_d['dist from feature'] = peak_loc - gene_coords[0] if gene['strand'] == '+' else gene_coords[1] - peak_loc + else : + out_d['dist from feature'] = peak_loc - downstream_coords[0] if gene['strand'] == '+' else downstream_coords[1] - peak_loc + + # does not map to this gene + else : + pass + + # map type is not blank if we mapped to something + if out_d['map type'] != '' : + + #out_d = {'knownGeneID':gene['name']} + out_d['knownGeneID'] = gene['name'] + if opts.symbol_xref : + out_d['geneSymbol'] = symbol_xref_map[gene['name']]['geneSymbol'] + peaks_writer.writerow(out_d) + mapped = True + + # reset map_type + out_d['map type'] = '' + + if not mapped : + if opts.intergenic : + out_d['knownGeneID'] = 'None' + out_d['geneSymbol'] = 'None' + out_d['map type'] = 'intergenic' + peaks_writer.writerow(out_d) + map_stats['intergenic'] += 1 + + if peak_output != sys.stdout: + peak_output.close() + + #if opts.stats_output != sys.stderr : + # opts.stats_output = open(opts.stats_output,'w') + + #for k,v in map_stats.items() : + # opts.stats_output.write('%s: %s\n'%(k,v)) + + #if opts.stats_output != sys.stderr : + # opts.stats_output.close() + + +if __name__ == '__main__' : + main() diff -r 248c4538e4ce -r 63dace20719b chipsequtil/map_to_known_genes/map_to_known_genes.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil/map_to_known_genes/map_to_known_genes.xml Wed Apr 13 17:36:07 2016 -0400 @@ -0,0 +1,44 @@ + + + Map the peaks in <peaks file> to genes in <knownGene file>. <knownGene file> isformat is as specified in http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/knownGene.sql.<peaks file> format is as produced by MACS. If *auto* is chosen (default) file extension is examined for *.xls* for default MACS format or *.bed* for BED format. If the --detail option is provided, the following extra fields are appended to each row: + peak loc, dist from feature, map type, map subtype + + + + chipsequtil + + + map_to_known_genes.py + $tss + --peaks-format=$peaks_fmt + --upstream-window=$upst_win + --downstream-window=$dnst_win + --map-output="$peaksOutput" + $detail + $intergenic + $knownGeneFile $knownGeneRef $macsPeaksFile + + + + + + + + + + + + + + + + + + + + + + + + diff -r 248c4538e4ce -r 63dace20719b chipsequtil/map_to_known_genes/tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil/map_to_known_genes/tool_dependencies.xml Wed Apr 13 17:36:07 2016 -0400 @@ -0,0 +1,17 @@ + + + + + + https://github.com/fraenkel-lab/chipsequtil/archive/master.zip + cp org_settings.cfg src/chipsequtil/ + python setup.py install --install-lib $INSTALL_DIR/lib/python --install-scripts $INSTALL_DIR/bin + + $INSTALL_DIR/bin + $INSTALL_DIR/lib/python + + + + + + diff -r 248c4538e4ce -r 63dace20719b chipsequtil/pieplot_macs/pieplots_macs.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil/pieplot_macs/pieplots_macs.py Wed Apr 13 17:36:07 2016 -0400 @@ -0,0 +1,127 @@ +''' + NAME + + pieplots_macs.py + + SYNOPSIS + +python pieplots_macs.py --genefile MACSoutfile_genes.txt --peakfile MACSoutfile_peaks.bed --outfile MACSdirectory/piechart.pdf + + + DESCRIPTION + +Peaks are assigned to the closest gene and then categorized according to their location at different genomic regions (promoter, intron, exon, or after the gene). Sites >10kb away from any gene are considered intergenic. (from Pamela) + +''' + +__author__='Renan Escalante' +__email__='renanec@mit.edu' + +import matplotlib +matplotlib.use('pdf') +from matplotlib import pyplot as plt +matplotlib.rcParams['pdf.fonttype']=42 +matplotlib.rcParams['font.size']=14 +import sys +from argparse import ArgumentParser +import pandas as pd + +def map_peaks(gene,peak,outfile,macsFlag): + genefile = open(gene, 'r') + peakfile = open(peak, 'r') + + types = {'promoter':0, 'after':0, 'intron':0, 'exon': 0} + + #read mapped gene file, store closest map for each peak + peaks={} #{chrom:{peakStart:[dist, type]}} + for line in genefile: + words = line.strip().split('\t') + #ignore first line + if words[0] == 'knownGeneID': continue + chrom = words[2] + + + if not macsFlag: + try: + start = int(words[3]) + dist = abs(int(words[15])) + maptype = words[16] + if maptype == 'gene': + maptype = words[17] + except: + pass + + else: + start = int(words[3])-1 + dist = abs(int(words[12])) + maptype = words[14] + if maptype == 'gene': + maptype = words[15] + + + if chrom not in peaks: + #new chrom + peaks[chrom] = {start:[dist,maptype]} + else: + if start in peaks[chrom].keys(): + #account for duplicate entries - choose closest gene and store type + if dist < peaks[chrom][start][0]: + #closer gene + peaks[chrom][start] = [dist, maptype] + else: peaks[chrom][start] = [dist, maptype] + + #count types - 1 per peak in peak file + types = {'promoter':0, 'after':0, 'intron':0, 'exon': 0, 'inter': 0} + totalpks = 0 + #Read peak file in bed format + for line in peakfile: + words = line.strip().split('\t') + chrom = words[0] + start = int(words[1]) + if chrom in peaks: + if start in peaks[chrom]: + types[peaks[chrom][start][1]] += 1 + else: + types['inter'] += 1 + else: + types['inter'] += 1 + totalpks += 1 + + + #-------------------------------------------- + # make a square figure and axes + #-------------------------------------------- + + fig = plt.figure(figsize=(6,6)) + pie_ax = fig.add_axes((0.3,0.2,0.4,0.4)) + + # The slices will be ordered and plotted counter-clockwise. + labels = ['exon: %i'%types['exon'],'intron: %i'%types['intron'],'promoter: %i'%types['promoter'],'intergenic: %i'%types['inter'], 'after: %i'%types['after']] + fracs = [types['exon'], types['intron'], types['promoter'], types['inter'], types['after']] + + plt.pie(fracs, labels=labels) #, autopct='%1.1f%%') + + #Export data frame with all the counts + indexDataFrame = ['exon','intron','promoter','intergenic','after'] + df = pd.DataFrame(data=fracs, index=indexDataFrame) + dfFileName = outfile.replace("pdf","csv") + df.to_csv(dfFileName, sep=',') + #plt.title('MACS peaks in %s'%(name)) + plt.figtext(.5, .1, 'Total: %i'%totalpks, ha='center') + fig.savefig(outfile) + +def main(): + usage = "usage: %prog --genefile MACSoutfile_genes.txt --peakfile MACSoutfile_peaks.bed --outfile MACSdirectory/piechart.pdf" + parser = ArgumentParser(usage) + parser.add_argument("--genefile", dest="genefile", help="Path to file MACS_mfold10,30_pval1e-5_genes.txt") + parser.add_argument("--peakfile", dest="peakfile", help="Path to file MACS_mfold10,30_pval1e-5_peaks.bed") + parser.add_argument("--outfile", dest="outfile", default="MACS_piechart.pdf", help="Path to pdf file where you want to store the piechart") + parser.add_argument('--MACS',action='store_true',default=False,help='Set this value if you have MACS peaks') + + args=parser.parse_args() + + map_peaks(args.genefile, args.peakfile, args.outfile, args.MACS) + + +if __name__=='__main__': + main() diff -r 248c4538e4ce -r 63dace20719b chipsequtil/pieplot_macs/pieplots_macs.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil/pieplot_macs/pieplots_macs.xml Wed Apr 13 17:36:07 2016 -0400 @@ -0,0 +1,22 @@ + + + Peaks are assigned to the closest gene and then categorized according to their location at different genomic regions (promoter, intron, exon, or after the gene). Sites >10kb away from any gene are considered intergenic. + + + + matplotlib + pandas + + + pieplots_macs.py genefile $genefilepeakfile $peakfile $MACSoutfile $out + + + + + + + + + + diff -r 248c4538e4ce -r 63dace20719b chipsequtil/tool_dependencies.xml --- a/chipsequtil/tool_dependencies.xml Mon Apr 04 16:41:59 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,17 +0,0 @@ - - - - - - https://github.com/fraenkel-lab/chipsequtil/archive/master.zip - cp org_settings.cfg src/chipsequtil/ - python setup.py install --install-lib $INSTALL_DIR/lib/python --install-scripts $INSTALL_DIR/bin - - $INSTALL_DIR/bin - $INSTALL_DIR/lib/python - - - - - -