0
|
1 #!/usr/local/bin/python
|
|
2
|
|
3 import sys, os
|
|
4 from optparse import OptionParser
|
|
5 from collections import defaultdict as dd
|
|
6 from csv import DictReader, DictWriter
|
|
7
|
|
8 from chipsequtil import MACSFile, BEDFile, KnownGeneFile, parse_number
|
|
9 from chipsequtil.util import MultiLineHelpFormatter
|
|
10
|
|
11 usage = '%prog [options] <knownGene file> <knownGene xRef file> <peaks file>'
|
|
12 description = """
|
|
13 Map the peaks in <peaks file> to genes in <knownGene file>. <knownGene file> is\
|
|
14 format is as specified in http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/knownGene.sql.\
|
|
15 <peaks file> format is as produced by MACS. If *auto* is chosen (default) file extension \
|
|
16 is examined for *.xls* for default MACS format or *.bed* for BED format. If the --detail \
|
|
17 option is provided, the following extra fields are appended to each row:
|
|
18
|
|
19 peak loc, dist from feature, map type, map subtype
|
|
20 """
|
|
21 epilog = ''
|
|
22 parser = OptionParser(usage=usage,description=description,epilog=epilog,formatter=MultiLineHelpFormatter())
|
|
23 parser.add_option('--upstream-window',dest='upst_win',type='int',default=5500,help='window width in base pairs to consider promoter region [default: %default]')
|
|
24 parser.add_option('--downstream-window',dest='dnst_win',type='int',default=2500,help='window width in base pairs to consider downstream region [default: %default]')
|
|
25 parser.add_option('--tss',dest='tss',action='store_true',help='calculate downstream window from transcription start site instead of transcription end site')
|
|
26 parser.add_option('--map-output',dest='peak_output',default=None,help='filename to output mapped peaks to [default: stdout]')
|
|
27 parser.add_option('--stats-output',dest='stats_output',default=sys.stderr,help='filename to output summary stats in conversion [default: stderr]')
|
|
28 parser.add_option('--peaks-format',dest='peaks_fmt',default='auto',type='choice',choices=['auto','MACS','BED'],help='format of peaks input file [default: %default]')
|
|
29 parser.add_option('--detail',dest='detail',action='store_true',help='add extra fields to output, see description')
|
|
30 parser.add_option('--intergenic',dest='intergenic',action='store_true',help='write intergenic peaks to the gene file as well with None as gene ID')
|
|
31 #parser.add_option('--symbol-xref',dest='symbol_xref',default=None,help='use the kgXref table file supplied to find a gene symbol, output as second column')
|
|
32
|
|
33 # TODO - options
|
|
34 #parser.add_option('--use-cds',dest='use_cds',action='store_true',help='use cdsStart and cdsEnd fields instead of txStart and txEnd to do mapping')
|
|
35 #parser.add_option('--capture-intergenic'...)
|
|
36 #parser.add_option('--map-format',dest='peak_format',type='choice',choices=['default','BED'],help='format of peak output [default: %default]')
|
|
37 #parser.add_option('--stats-format',dest='stats_format',type='choice',choices=['human','python'],help='format of summary stats output [default: %default]')
|
|
38
|
|
39 def parse_gene_ref(ref_gene) :
|
|
40 reader = KnownGeneFile(ref_gene)
|
|
41 gene_ref = dd(list)
|
|
42 for ref_dict in reader :
|
|
43 gene_ref[ref_dict['chrom']].append(ref_dict)
|
|
44
|
|
45 return gene_ref
|
|
46
|
|
47 def parse_gene_ref_line(l) :
|
|
48 l = map(parse_number, l) # coerce to numbers where possible
|
|
49 l[9] = map(parse_number, l[9].split(',')) # turn 'x,x,x,...' into list
|
|
50 l[10] = map(parse_number, l[10].split(','))
|
|
51 return l
|
|
52
|
|
53 if __name__ == '__main__' :
|
|
54
|
|
55 opts, args = parser.parse_args(sys.argv[1:])
|
|
56
|
|
57 if len(args) < 3 :
|
|
58 parser.error('Must provide three filename arguments')
|
|
59
|
|
60 gene_ref = parse_gene_ref(args[0])
|
|
61 xref_fn = args[1]
|
|
62 peaks_fn = args[2]
|
|
63 if opts.peaks_fmt == 'auto' :
|
|
64 path,ext = os.path.splitext(peaks_fn)
|
|
65 if ext.lower() == '.xls' :
|
|
66 opts.peaks_fmt = 'MACS'
|
|
67 elif ext.lower() == '.bed' :
|
|
68 opts.peaks_fmt = 'BED'
|
|
69 elif ext.lower() == '.narrowpeak' :
|
|
70 opts.peaks_fmt = 'BED'
|
|
71 else :
|
|
72 parser.error('Could not guess peaks file format by extension (%s), aborting'%ext)
|
|
73
|
|
74 if opts.peaks_fmt == 'MACS' :
|
|
75 peaks_reader_cls = MACSFile
|
|
76 chr_field, start_field, end_field = 'chr', 'start', 'end'
|
|
77 elif opts.peaks_fmt == 'BED' :
|
|
78 peaks_reader_cls = BEDFile
|
|
79 chr_field, start_field, end_field = 'chrom', 'chromStart', 'chromEnd'
|
|
80 else :
|
|
81 # should never happen
|
|
82 fieldnames = []
|
|
83
|
|
84 #peaks_reader = DictReader(open(args[1]),fieldnames=fieldnames,delimiter='\t')
|
|
85 peaks_reader = peaks_reader_cls(peaks_fn)
|
|
86
|
|
87 # default output format:
|
|
88 if opts.peak_output :
|
|
89 peak_output = open(opts.peak_output,'w')
|
|
90 else :
|
|
91 peak_output = sys.stdout
|
|
92
|
|
93 fieldnames = peaks_reader.FIELD_NAMES
|
|
94 if opts.detail :
|
|
95 fieldnames += ["peak loc","dist from feature","map type","map subtype"]#"score"
|
|
96 output_fields = ['knownGeneID']+fieldnames
|
|
97
|
|
98 # see if the user wants gene symbols too
|
|
99 # TODO - actually make this an option, or make it required
|
|
100 opts.symbol_xref = xref_fn
|
|
101 if opts.symbol_xref :
|
|
102 kgXref_fieldnames = ['kgID','mRNA','spID','spDisplayID','geneSymbol','refseq','protAcc','description']
|
|
103 symbol_xref_reader = DictReader(open(opts.symbol_xref),fieldnames=kgXref_fieldnames,delimiter='\t')
|
|
104 symbol_xref_map = {}
|
|
105 for rec in symbol_xref_reader :
|
|
106 symbol_xref_map[rec['kgID']] = rec
|
|
107 output_fields = ['knownGeneID','geneSymbol']+fieldnames
|
|
108
|
|
109 peaks_writer = DictWriter(peak_output,output_fields,delimiter='\t',extrasaction='ignore',lineterminator='\n')
|
|
110 peaks_writer.writerow(dict([(k,k) for k in output_fields]))
|
|
111 unique_genes = set()
|
|
112 map_stats = dd(int)
|
|
113 for peak in peaks_reader :
|
|
114
|
|
115 # if this is a comment or header line get skip it
|
|
116 if peak[fieldnames[0]].startswith('#') or \
|
|
117 peak[fieldnames[0]] == fieldnames[0] or \
|
|
118 peak[fieldnames[0]].startswith('track') : continue
|
|
119
|
|
120 # coerce values to numeric if possible
|
|
121 for k,v in peak.items() : peak[k] = parse_number(v)
|
|
122
|
|
123 # MACS output gives us summit
|
|
124 if opts.peaks_fmt == 'MACS' :
|
|
125 peak_loc = peak[start_field]+peak['summit']
|
|
126 else : # peak assumed to be in the middle of the reported peak range
|
|
127 peak_loc = (peak[start_field]+peak[end_field])/2
|
|
128
|
|
129 chrom_genes = gene_ref[peak[chr_field]]
|
|
130
|
|
131 if len(chrom_genes) == 0 :
|
|
132 sys.stderr.write('WARNING: peak chromosome %s not found in gene reference, skipping: %s\n'%(peak[chr_field],peak))
|
|
133 continue
|
|
134
|
|
135 mapped = False
|
|
136
|
|
137 # walk through the genes for this chromosome
|
|
138 for gene in chrom_genes :
|
|
139
|
|
140 # reusable dictionary for output
|
|
141 out_d = {}.fromkeys(output_fields,0)
|
|
142 out_d.update(peak)
|
|
143 out_d['map type'] = ''
|
|
144 out_d['chromo'] = peak[chr_field]
|
|
145 out_d['peak loc'] = peak_loc
|
|
146
|
|
147 # determine intervals for promoter, gene, and downstream
|
|
148 if gene['strand'] == '+' :
|
|
149 promoter_coords = max(gene['txStart']-1-opts.upst_win,0), gene['txStart']-1
|
|
150 if opts.tss :
|
|
151 gene_coords = gene['txStart'], min(gene['txEnd'],gene['txStart']+opts.dnst_win)
|
|
152 downstream_coords = gene['txEnd']+1,gene['txStart']+opts.dnst_win
|
|
153 else :
|
|
154 gene_coords = gene['txStart'], gene['txEnd']
|
|
155 downstream_coords = gene['txEnd']+1, gene['txEnd']+1+opts.dnst_win
|
|
156 else :
|
|
157 promoter_coords = gene['txEnd']+1, gene['txEnd']+1+opts.upst_win # +1 because we're using 1 based indexing
|
|
158 if opts.tss :
|
|
159 gene_coords = max(gene['txStart'],gene['txEnd']-opts.upst_win), gene['txEnd']
|
|
160 downstream_coords = gene['txEnd']-1-opts.dnst_win, gene['txStart']-1 # -1 because we're using 1 based indexing
|
|
161 else :
|
|
162 gene_coords = gene['txStart'], gene['txEnd']
|
|
163 downstream_coords = gene['txStart']-1-opts.dnst_win, gene['txStart']-1 # -1 because we're using 1 based indexing
|
|
164
|
|
165 # check for promoter
|
|
166 if peak_loc >= promoter_coords[0] and peak_loc <= promoter_coords[1] :
|
|
167 out_d['map type'] = 'promoter'
|
|
168 out_d['dist from feature'] = peak_loc - promoter_coords[1] if gene['strand'] == '+' else promoter_coords[0] - peak_loc
|
|
169
|
|
170 # check for gene
|
|
171 elif peak_loc >= gene_coords[0] and peak_loc <= gene_coords[1] :
|
|
172 # check for intron/exon
|
|
173 exon_coords = zip(gene['exonStarts'],gene['exonEnds'])
|
|
174 in_exon = False
|
|
175 for st,en in exon_coords :
|
|
176 if peak_loc >= st and peak_loc <= en :
|
|
177 in_exon = True
|
|
178 break
|
|
179 out_d['map type'] = 'gene'
|
|
180 out_d['map subtype'] = 'exon' if in_exon else 'intron'
|
|
181
|
|
182 #Commented out to keep score reported in bed file - AJD 7/29/14
|
|
183 # score = (peak-TSS)/(TSE-TSS) - peak distance from TSS as fraction of length of gene
|
|
184 #gene_len = float(gene_coords[1]-gene_coords[0])
|
|
185 #out_d['score'] = (peak_loc-gene_coords[0])/gene_len if gene['strand'] == '+' else (gene_coords[1]-peak_loc)/gene_len
|
|
186
|
|
187 # distance calculated from start of gene
|
|
188 out_d['dist from feature'] = peak_loc - promoter_coords[1] if gene['strand'] == '+' else promoter_coords[0] - peak_loc
|
|
189
|
|
190 map_stats[out_d['map subtype']] += 1
|
|
191
|
|
192 # check for downstream
|
|
193 elif peak_loc >= downstream_coords[0] and peak_loc <= downstream_coords[1] :
|
|
194 out_d['map type'] = 'after'
|
|
195 if opts.tss :
|
|
196 out_d['dist from feature'] = peak_loc - gene_coords[0] if gene['strand'] == '+' else gene_coords[1] - peak_loc
|
|
197 else :
|
|
198 out_d['dist from feature'] = peak_loc - downstream_coords[0] if gene['strand'] == '+' else downstream_coords[1] - peak_loc
|
|
199
|
|
200 # does not map to this gene
|
|
201 else :
|
|
202 pass
|
|
203
|
|
204 # map type is not blank if we mapped to something
|
|
205 if out_d['map type'] != '' :
|
|
206
|
|
207 #out_d = {'knownGeneID':gene['name']}
|
|
208 out_d['knownGeneID'] = gene['name']
|
|
209 if opts.symbol_xref :
|
|
210 out_d['geneSymbol'] = symbol_xref_map[gene['name']]['geneSymbol']
|
|
211 peaks_writer.writerow(out_d)
|
|
212
|
|
213 mapped = True
|
|
214
|
|
215 # reset map_type
|
|
216 out_d['map type'] = ''
|
|
217
|
|
218 if not mapped :
|
|
219 if opts.intergenic :
|
|
220 out_d['knownGeneID'] = 'None'
|
|
221 out_d['geneSymbol'] = 'None'
|
|
222 out_d['map type'] = 'intergenic'
|
|
223 peaks_writer.writerow(out_d)
|
|
224 map_stats['intergenic'] += 1
|
|
225
|
|
226 if peak_output != sys.stdout :
|
|
227 peak_output.close()
|
|
228
|
|
229 #if opts.stats_output != sys.stderr :
|
|
230 # opts.stats_output = open(opts.stats_output,'w')
|
|
231
|
|
232 #for k,v in map_stats.items() :
|
|
233 # opts.stats_output.write('%s: %s\n'%(k,v))
|
|
234
|
|
235 #if opts.stats_output != sys.stderr :
|
|
236 # opts.stats_output.close()
|