Mercurial > repos > alenail > chipsequtil
comparison chipsequtil/map_to_known_genes.py @ 0:4f66143b385a draft
Uploaded
author | alenail |
---|---|
date | Mon, 28 Mar 2016 13:08:59 -0400 |
parents | |
children | 0eaa8225a09a |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4f66143b385a |
---|---|
1 #!/usr/local/bin/python | |
2 | |
3 import sys, os | |
4 from optparse import OptionParser | |
5 from collections import defaultdict as dd | |
6 from csv import DictReader, DictWriter | |
7 | |
8 from chipsequtil import MACSFile, BEDFile, KnownGeneFile, parse_number | |
9 from chipsequtil.util import MultiLineHelpFormatter | |
10 | |
11 usage = '%prog [options] <knownGene file> <knownGene xRef file> <peaks file>' | |
12 description = """ | |
13 Map the peaks in <peaks file> to genes in <knownGene file>. <knownGene file> is\ | |
14 format is as specified in http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/knownGene.sql.\ | |
15 <peaks file> format is as produced by MACS. If *auto* is chosen (default) file extension \ | |
16 is examined for *.xls* for default MACS format or *.bed* for BED format. If the --detail \ | |
17 option is provided, the following extra fields are appended to each row: | |
18 | |
19 peak loc, dist from feature, map type, map subtype | |
20 """ | |
21 epilog = '' | |
22 parser = OptionParser(usage=usage,description=description,epilog=epilog,formatter=MultiLineHelpFormatter()) | |
23 parser.add_option('--upstream-window',dest='upst_win',type='int',default=5500,help='window width in base pairs to consider promoter region [default: %default]') | |
24 parser.add_option('--downstream-window',dest='dnst_win',type='int',default=2500,help='window width in base pairs to consider downstream region [default: %default]') | |
25 parser.add_option('--tss',dest='tss',action='store_true',help='calculate downstream window from transcription start site instead of transcription end site') | |
26 parser.add_option('--map-output',dest='peak_output',default=None,help='filename to output mapped peaks to [default: stdout]') | |
27 parser.add_option('--stats-output',dest='stats_output',default=sys.stderr,help='filename to output summary stats in conversion [default: stderr]') | |
28 parser.add_option('--peaks-format',dest='peaks_fmt',default='auto',type='choice',choices=['auto','MACS','BED'],help='format of peaks input file [default: %default]') | |
29 parser.add_option('--detail',dest='detail',action='store_true',help='add extra fields to output, see description') | |
30 parser.add_option('--intergenic',dest='intergenic',action='store_true',help='write intergenic peaks to the gene file as well with None as gene ID') | |
31 #parser.add_option('--symbol-xref',dest='symbol_xref',default=None,help='use the kgXref table file supplied to find a gene symbol, output as second column') | |
32 | |
33 # TODO - options | |
34 #parser.add_option('--use-cds',dest='use_cds',action='store_true',help='use cdsStart and cdsEnd fields instead of txStart and txEnd to do mapping') | |
35 #parser.add_option('--capture-intergenic'...) | |
36 #parser.add_option('--map-format',dest='peak_format',type='choice',choices=['default','BED'],help='format of peak output [default: %default]') | |
37 #parser.add_option('--stats-format',dest='stats_format',type='choice',choices=['human','python'],help='format of summary stats output [default: %default]') | |
38 | |
39 def parse_gene_ref(ref_gene) : | |
40 reader = KnownGeneFile(ref_gene) | |
41 gene_ref = dd(list) | |
42 for ref_dict in reader : | |
43 gene_ref[ref_dict['chrom']].append(ref_dict) | |
44 | |
45 return gene_ref | |
46 | |
47 def parse_gene_ref_line(l) : | |
48 l = map(parse_number, l) # coerce to numbers where possible | |
49 l[9] = map(parse_number, l[9].split(',')) # turn 'x,x,x,...' into list | |
50 l[10] = map(parse_number, l[10].split(',')) | |
51 return l | |
52 | |
53 if __name__ == '__main__' : | |
54 | |
55 opts, args = parser.parse_args(sys.argv[1:]) | |
56 | |
57 if len(args) < 3 : | |
58 parser.error('Must provide three filename arguments') | |
59 | |
60 gene_ref = parse_gene_ref(args[0]) | |
61 xref_fn = args[1] | |
62 peaks_fn = args[2] | |
63 if opts.peaks_fmt == 'auto' : | |
64 path,ext = os.path.splitext(peaks_fn) | |
65 if ext.lower() == '.xls' : | |
66 opts.peaks_fmt = 'MACS' | |
67 elif ext.lower() == '.bed' : | |
68 opts.peaks_fmt = 'BED' | |
69 elif ext.lower() == '.narrowpeak' : | |
70 opts.peaks_fmt = 'BED' | |
71 else : | |
72 parser.error('Could not guess peaks file format by extension (%s), aborting'%ext) | |
73 | |
74 if opts.peaks_fmt == 'MACS' : | |
75 peaks_reader_cls = MACSFile | |
76 chr_field, start_field, end_field = 'chr', 'start', 'end' | |
77 elif opts.peaks_fmt == 'BED' : | |
78 peaks_reader_cls = BEDFile | |
79 chr_field, start_field, end_field = 'chrom', 'chromStart', 'chromEnd' | |
80 else : | |
81 # should never happen | |
82 fieldnames = [] | |
83 | |
84 #peaks_reader = DictReader(open(args[1]),fieldnames=fieldnames,delimiter='\t') | |
85 peaks_reader = peaks_reader_cls(peaks_fn) | |
86 | |
87 # default output format: | |
88 if opts.peak_output : | |
89 peak_output = open(opts.peak_output,'w') | |
90 else : | |
91 peak_output = sys.stdout | |
92 | |
93 fieldnames = peaks_reader.FIELD_NAMES | |
94 if opts.detail : | |
95 fieldnames += ["peak loc","dist from feature","map type","map subtype"]#"score" | |
96 output_fields = ['knownGeneID']+fieldnames | |
97 | |
98 # see if the user wants gene symbols too | |
99 # TODO - actually make this an option, or make it required | |
100 opts.symbol_xref = xref_fn | |
101 if opts.symbol_xref : | |
102 kgXref_fieldnames = ['kgID','mRNA','spID','spDisplayID','geneSymbol','refseq','protAcc','description'] | |
103 symbol_xref_reader = DictReader(open(opts.symbol_xref),fieldnames=kgXref_fieldnames,delimiter='\t') | |
104 symbol_xref_map = {} | |
105 for rec in symbol_xref_reader : | |
106 symbol_xref_map[rec['kgID']] = rec | |
107 output_fields = ['knownGeneID','geneSymbol']+fieldnames | |
108 | |
109 peaks_writer = DictWriter(peak_output,output_fields,delimiter='\t',extrasaction='ignore',lineterminator='\n') | |
110 peaks_writer.writerow(dict([(k,k) for k in output_fields])) | |
111 unique_genes = set() | |
112 map_stats = dd(int) | |
113 for peak in peaks_reader : | |
114 | |
115 # if this is a comment or header line get skip it | |
116 if peak[fieldnames[0]].startswith('#') or \ | |
117 peak[fieldnames[0]] == fieldnames[0] or \ | |
118 peak[fieldnames[0]].startswith('track') : continue | |
119 | |
120 # coerce values to numeric if possible | |
121 for k,v in peak.items() : peak[k] = parse_number(v) | |
122 | |
123 # MACS output gives us summit | |
124 if opts.peaks_fmt == 'MACS' : | |
125 peak_loc = peak[start_field]+peak['summit'] | |
126 else : # peak assumed to be in the middle of the reported peak range | |
127 peak_loc = (peak[start_field]+peak[end_field])/2 | |
128 | |
129 chrom_genes = gene_ref[peak[chr_field]] | |
130 | |
131 if len(chrom_genes) == 0 : | |
132 sys.stderr.write('WARNING: peak chromosome %s not found in gene reference, skipping: %s\n'%(peak[chr_field],peak)) | |
133 continue | |
134 | |
135 mapped = False | |
136 | |
137 # walk through the genes for this chromosome | |
138 for gene in chrom_genes : | |
139 | |
140 # reusable dictionary for output | |
141 out_d = {}.fromkeys(output_fields,0) | |
142 out_d.update(peak) | |
143 out_d['map type'] = '' | |
144 out_d['chromo'] = peak[chr_field] | |
145 out_d['peak loc'] = peak_loc | |
146 | |
147 # determine intervals for promoter, gene, and downstream | |
148 if gene['strand'] == '+' : | |
149 promoter_coords = max(gene['txStart']-1-opts.upst_win,0), gene['txStart']-1 | |
150 if opts.tss : | |
151 gene_coords = gene['txStart'], min(gene['txEnd'],gene['txStart']+opts.dnst_win) | |
152 downstream_coords = gene['txEnd']+1,gene['txStart']+opts.dnst_win | |
153 else : | |
154 gene_coords = gene['txStart'], gene['txEnd'] | |
155 downstream_coords = gene['txEnd']+1, gene['txEnd']+1+opts.dnst_win | |
156 else : | |
157 promoter_coords = gene['txEnd']+1, gene['txEnd']+1+opts.upst_win # +1 because we're using 1 based indexing | |
158 if opts.tss : | |
159 gene_coords = max(gene['txStart'],gene['txEnd']-opts.upst_win), gene['txEnd'] | |
160 downstream_coords = gene['txEnd']-1-opts.dnst_win, gene['txStart']-1 # -1 because we're using 1 based indexing | |
161 else : | |
162 gene_coords = gene['txStart'], gene['txEnd'] | |
163 downstream_coords = gene['txStart']-1-opts.dnst_win, gene['txStart']-1 # -1 because we're using 1 based indexing | |
164 | |
165 # check for promoter | |
166 if peak_loc >= promoter_coords[0] and peak_loc <= promoter_coords[1] : | |
167 out_d['map type'] = 'promoter' | |
168 out_d['dist from feature'] = peak_loc - promoter_coords[1] if gene['strand'] == '+' else promoter_coords[0] - peak_loc | |
169 | |
170 # check for gene | |
171 elif peak_loc >= gene_coords[0] and peak_loc <= gene_coords[1] : | |
172 # check for intron/exon | |
173 exon_coords = zip(gene['exonStarts'],gene['exonEnds']) | |
174 in_exon = False | |
175 for st,en in exon_coords : | |
176 if peak_loc >= st and peak_loc <= en : | |
177 in_exon = True | |
178 break | |
179 out_d['map type'] = 'gene' | |
180 out_d['map subtype'] = 'exon' if in_exon else 'intron' | |
181 | |
182 #Commented out to keep score reported in bed file - AJD 7/29/14 | |
183 # score = (peak-TSS)/(TSE-TSS) - peak distance from TSS as fraction of length of gene | |
184 #gene_len = float(gene_coords[1]-gene_coords[0]) | |
185 #out_d['score'] = (peak_loc-gene_coords[0])/gene_len if gene['strand'] == '+' else (gene_coords[1]-peak_loc)/gene_len | |
186 | |
187 # distance calculated from start of gene | |
188 out_d['dist from feature'] = peak_loc - promoter_coords[1] if gene['strand'] == '+' else promoter_coords[0] - peak_loc | |
189 | |
190 map_stats[out_d['map subtype']] += 1 | |
191 | |
192 # check for downstream | |
193 elif peak_loc >= downstream_coords[0] and peak_loc <= downstream_coords[1] : | |
194 out_d['map type'] = 'after' | |
195 if opts.tss : | |
196 out_d['dist from feature'] = peak_loc - gene_coords[0] if gene['strand'] == '+' else gene_coords[1] - peak_loc | |
197 else : | |
198 out_d['dist from feature'] = peak_loc - downstream_coords[0] if gene['strand'] == '+' else downstream_coords[1] - peak_loc | |
199 | |
200 # does not map to this gene | |
201 else : | |
202 pass | |
203 | |
204 # map type is not blank if we mapped to something | |
205 if out_d['map type'] != '' : | |
206 | |
207 #out_d = {'knownGeneID':gene['name']} | |
208 out_d['knownGeneID'] = gene['name'] | |
209 if opts.symbol_xref : | |
210 out_d['geneSymbol'] = symbol_xref_map[gene['name']]['geneSymbol'] | |
211 peaks_writer.writerow(out_d) | |
212 | |
213 mapped = True | |
214 | |
215 # reset map_type | |
216 out_d['map type'] = '' | |
217 | |
218 if not mapped : | |
219 if opts.intergenic : | |
220 out_d['knownGeneID'] = 'None' | |
221 out_d['geneSymbol'] = 'None' | |
222 out_d['map type'] = 'intergenic' | |
223 peaks_writer.writerow(out_d) | |
224 map_stats['intergenic'] += 1 | |
225 | |
226 if peak_output != sys.stdout : | |
227 peak_output.close() | |
228 | |
229 #if opts.stats_output != sys.stderr : | |
230 # opts.stats_output = open(opts.stats_output,'w') | |
231 | |
232 #for k,v in map_stats.items() : | |
233 # opts.stats_output.write('%s: %s\n'%(k,v)) | |
234 | |
235 #if opts.stats_output != sys.stderr : | |
236 # opts.stats_output.close() |