comparison fimo_wrapper.py @ 5:eca84de658b0 draft

Uploaded
author iuc
date Fri, 17 Jun 2016 13:15:48 -0400
parents fd522a964017
children cdcc4bb60bc3
comparison
equal deleted inserted replaced
4:cd54079f0f72 5:eca84de658b0
8 import tempfile 8 import tempfile
9 9
10 BUFFSIZE = 1048576 10 BUFFSIZE = 1048576
11 # Translation table for reverse Complement, with ambiguity codes. 11 # Translation table for reverse Complement, with ambiguity codes.
12 DNA_COMPLEMENT = string.maketrans("ACGTRYKMBDHVacgtrykmbdhv", "TGCAYRMKVHDBtgcayrmkvhdb") 12 DNA_COMPLEMENT = string.maketrans("ACGTRYKMBDHVacgtrykmbdhv", "TGCAYRMKVHDBtgcayrmkvhdb")
13
14
15 def get_stderr(tmp_stderr):
16 tmp_stderr.seek(0)
17 stderr = ''
18 try:
19 while True:
20 stderr += tmp_stderr.read(BUFFSIZE)
21 if not stderr or len(stderr) % BUFFSIZE != 0:
22 break
23 except OverflowError:
24 pass
25 return stderr
13 26
14 27
15 def reverse(sequence): 28 def reverse(sequence):
16 # Reverse sequence string. 29 # Reverse sequence string.
17 return sequence[::-1] 30 return sequence[::-1]
41 parser.add_argument('--alpha', dest='alpha', type=float, default=1.0, help='The alpha parameter for calculating position specific priors') 54 parser.add_argument('--alpha', dest='alpha', type=float, default=1.0, help='The alpha parameter for calculating position specific priors')
42 parser.add_argument('--bgfile', dest='bgfile', default=None, help='Background file type, used only if not "default"') 55 parser.add_argument('--bgfile', dest='bgfile', default=None, help='Background file type, used only if not "default"')
43 parser.add_argument('--max_strand', action='store_true', help='If matches on both strands at a given position satisfy the output threshold, only report the match for the strand with the higher score') 56 parser.add_argument('--max_strand', action='store_true', help='If matches on both strands at a given position satisfy the output threshold, only report the match for the strand with the higher score')
44 parser.add_argument('--max_stored_scores', dest='max_stored_scores', type=int, help='Maximum score count to store') 57 parser.add_argument('--max_stored_scores', dest='max_stored_scores', type=int, help='Maximum score count to store')
45 parser.add_argument('--motif', dest='motifs', action='append', default=[], help='Specify motif by id') 58 parser.add_argument('--motif', dest='motifs', action='append', default=[], help='Specify motif by id')
59 parser.add_argument('--output_separate_motifs', default="no", help='Output one dataset per motif')
46 parser.add_argument('--motif_pseudo', dest='motif_pseudo', type=float, default=0.1, help='Pseudocount to add to counts in motif matrix') 60 parser.add_argument('--motif_pseudo', dest='motif_pseudo', type=float, default=0.1, help='Pseudocount to add to counts in motif matrix')
47 parser.add_argument('--no_qvalue', action='store_true', help='Do not compute a q-value for each p-value') 61 parser.add_argument('--no_qvalue', action='store_true', help='Do not compute a q-value for each p-value')
48 parser.add_argument('--norc', action='store_true', help='Do not score the reverse complement DNA strand') 62 parser.add_argument('--norc', action='store_true', help='Do not score the reverse complement DNA strand')
49 parser.add_argument('--output_path', dest='output_path', help='Output files directory') 63 parser.add_argument('--output_path', dest='output_path', help='Output files directory')
50 parser.add_argument('--parse_genomic_coord', action='store_true', help='Check each sequence header for UCSC style genomic coordinates') 64 parser.add_argument('--parse_genomic_coord', default='no', help='Check each sequence header for UCSC style genomic coordinates')
65 parser.add_argument('--remove_duplicate_coords', default='no', help='Remove duplicate entries in unique GFF coordinates')
51 parser.add_argument('--qv_thresh', action='store_true', help='Use q-values for the output threshold') 66 parser.add_argument('--qv_thresh', action='store_true', help='Use q-values for the output threshold')
52 parser.add_argument('--thresh', dest='thresh', type=float, help='p-value threshold') 67 parser.add_argument('--thresh', dest='thresh', type=float, help='p-value threshold')
53 parser.add_argument('--gff_output', dest='gff_output', help='Gff output file') 68 parser.add_argument('--gff_output', dest='gff_output', help='Gff output file')
54 parser.add_argument('--html_output', dest='html_output', help='HTML output file') 69 parser.add_argument('--html_output', dest='html_output', help='HTML output file')
55 parser.add_argument('--interval_output', dest='interval_output', help='Interval output file') 70 parser.add_argument('--interval_output', dest='interval_output', help='Interval output file')
71 fimo_cmd_list.append('--motif-pseudo %4f' % args.motif_pseudo) 86 fimo_cmd_list.append('--motif-pseudo %4f' % args.motif_pseudo)
72 if args.no_qvalue: 87 if args.no_qvalue:
73 fimo_cmd_list.append('--no-qvalue') 88 fimo_cmd_list.append('--no-qvalue')
74 if args.norc: 89 if args.norc:
75 fimo_cmd_list.append('--norc') 90 fimo_cmd_list.append('--norc')
76 if args.parse_genomic_coord: 91 if args.parse_genomic_coord == 'yes':
77 fimo_cmd_list.append('--parse-genomic-coord') 92 fimo_cmd_list.append('--parse-genomic-coord')
78 if args.qv_thresh: 93 if args.qv_thresh:
79 fimo_cmd_list.append('--qv-thresh') 94 fimo_cmd_list.append('--qv-thresh')
80 fimo_cmd_list.append('--thresh %4f' % args.thresh) 95 fimo_cmd_list.append('--thresh %4f' % args.thresh)
81 if args.input_psp is not None: 96 if args.input_psp is not None:
91 106
92 try: 107 try:
93 tmp_stderr = tempfile.NamedTemporaryFile() 108 tmp_stderr = tempfile.NamedTemporaryFile()
94 proc = subprocess.Popen(args=fimo_cmd, shell=True, stderr=tmp_stderr) 109 proc = subprocess.Popen(args=fimo_cmd, shell=True, stderr=tmp_stderr)
95 returncode = proc.wait() 110 returncode = proc.wait()
96 tmp_stderr.seek(0)
97 stderr = ''
98 try:
99 while True:
100 stderr += tmp_stderr.read(BUFFSIZE)
101 if not stderr or len(stderr) % BUFFSIZE != 0:
102 break
103 except OverflowError:
104 pass
105 if returncode != 0: 111 if returncode != 0:
112 stderr = get_stderr(tmp_stderr)
106 stop_err(stderr) 113 stop_err(stderr)
107 except Exception, e: 114 except Exception, e:
108 stop_err('Error running FIMO:\n%s' % str(e)) 115 stop_err('Error running FIMO:\n%s' % str(e))
109 116
110 shutil.move(os.path.join(args.output_path, 'fimo.txt'), args.txt_output) 117 shutil.move(os.path.join(args.output_path, 'fimo.txt'), args.txt_output)
111 shutil.move(os.path.join(args.output_path, 'fimo.gff'), args.gff_output) 118
119 gff_file = os.path.join(args.output_path, 'fimo.gff')
120 if args.remove_duplicate_coords == 'yes':
121 tmp_stderr = tempfile.NamedTemporaryFile()
122 # Identify and eliminating identical motif occurrences. These
123 # are identical if the combination of chrom, start, end and
124 # motif id are identical.
125 cmd = 'sort -k1,1 -k4,4n -k5,5n -k9.1,9.6 -u -o %s %s' % (gff_file, gff_file)
126 proc = subprocess.Popen(args=cmd, stderr=tmp_stderr, shell=True)
127 returncode = proc.wait()
128 if returncode != 0:
129 stderr = get_stderr(tmp_stderr)
130 stop_err(stderr)
131 # Sort GFF output by a combination of chrom, score, start.
132 cmd = 'sort -k1,1 -k4,4n -k6,6n -o %s %s' % (gff_file, gff_file)
133 proc = subprocess.Popen(args=cmd, stderr=tmp_stderr, shell=True)
134 returncode = proc.wait()
135 if returncode != 0:
136 stderr = get_stderr(tmp_stderr)
137 stop_err(stderr)
138 if args.output_separate_motifs == 'yes':
139 # Create the collection output directory.
140 collection_path = (os.path.join(os.getcwd(), 'output'))
141 # Keep track of motif occurrences.
142 header_line = None
143 motif_ids = []
144 file_handles = []
145 for line in open(gff_file, 'r'):
146 if line.startswith('#'):
147 if header_line is None:
148 header_line = line
149 continue
150 items = line.split('\t')
151 attribute = items[8]
152 attributes = attribute.split(';')
153 name = attributes[0]
154 motif_id = name.split('=')[1]
155 file_name = os.path.join(collection_path, 'MOTIF%s.gff' % motif_id)
156 if motif_id in motif_ids:
157 i = motif_ids.index(motif_id)
158 fh = file_handles[i]
159 fh.write(line)
160 else:
161 fh = open(file_name, 'wb')
162 if header_line is not None:
163 fh.write(header_line)
164 fh.write(line)
165 motif_ids.append(motif_id)
166 file_handles.append(fh)
167 for file_handle in file_handles:
168 file_handle.close()
169 else:
170 shutil.move(gff_file, args.gff_output)
112 shutil.move(os.path.join(args.output_path, 'fimo.xml'), args.xml_output) 171 shutil.move(os.path.join(args.output_path, 'fimo.xml'), args.xml_output)
113 shutil.move(os.path.join(args.output_path, 'fimo.html'), args.html_output) 172 shutil.move(os.path.join(args.output_path, 'fimo.html'), args.html_output)
114 173
115 out_file = open(args.interval_output, 'wb') 174 out_file = open(args.interval_output, 'wb')
116 out_file.write("#%s\n" % "\t".join(("chr", "start", "end", "pattern name", "score", "strand", "matched sequence", "p-value", "q-value"))) 175 out_file.write("#%s\n" % "\t".join(("chr", "start", "end", "pattern name", "score", "strand", "matched sequence", "p-value", "q-value")))