Mercurial > repos > iuc > meme_fimo
diff fimo_wrapper.py @ 5:eca84de658b0 draft
Uploaded
author | iuc |
---|---|
date | Fri, 17 Jun 2016 13:15:48 -0400 |
parents | fd522a964017 |
children | cdcc4bb60bc3 |
line wrap: on
line diff
--- a/fimo_wrapper.py Tue Mar 08 08:10:52 2016 -0500 +++ b/fimo_wrapper.py Fri Jun 17 13:15:48 2016 -0400 @@ -12,6 +12,19 @@ DNA_COMPLEMENT = string.maketrans("ACGTRYKMBDHVacgtrykmbdhv", "TGCAYRMKVHDBtgcayrmkvhdb") +def get_stderr(tmp_stderr): + tmp_stderr.seek(0) + stderr = '' + try: + while True: + stderr += tmp_stderr.read(BUFFSIZE) + if not stderr or len(stderr) % BUFFSIZE != 0: + break + except OverflowError: + pass + return stderr + + def reverse(sequence): # Reverse sequence string. return sequence[::-1] @@ -43,11 +56,13 @@ parser.add_argument('--max_strand', action='store_true', help='If matches on both strands at a given position satisfy the output threshold, only report the match for the strand with the higher score') parser.add_argument('--max_stored_scores', dest='max_stored_scores', type=int, help='Maximum score count to store') parser.add_argument('--motif', dest='motifs', action='append', default=[], help='Specify motif by id') +parser.add_argument('--output_separate_motifs', default="no", help='Output one dataset per motif') parser.add_argument('--motif_pseudo', dest='motif_pseudo', type=float, default=0.1, help='Pseudocount to add to counts in motif matrix') parser.add_argument('--no_qvalue', action='store_true', help='Do not compute a q-value for each p-value') parser.add_argument('--norc', action='store_true', help='Do not score the reverse complement DNA strand') parser.add_argument('--output_path', dest='output_path', help='Output files directory') -parser.add_argument('--parse_genomic_coord', action='store_true', help='Check each sequence header for UCSC style genomic coordinates') +parser.add_argument('--parse_genomic_coord', default='no', help='Check each sequence header for UCSC style genomic coordinates') +parser.add_argument('--remove_duplicate_coords', default='no', help='Remove duplicate entries in unique GFF coordinates') parser.add_argument('--qv_thresh', action='store_true', help='Use q-values for the output threshold') parser.add_argument('--thresh', dest='thresh', type=float, help='p-value threshold') parser.add_argument('--gff_output', dest='gff_output', help='Gff output file') @@ -73,7 +88,7 @@ fimo_cmd_list.append('--no-qvalue') if args.norc: fimo_cmd_list.append('--norc') - if args.parse_genomic_coord: + if args.parse_genomic_coord == 'yes': fimo_cmd_list.append('--parse-genomic-coord') if args.qv_thresh: fimo_cmd_list.append('--qv-thresh') @@ -93,22 +108,66 @@ tmp_stderr = tempfile.NamedTemporaryFile() proc = subprocess.Popen(args=fimo_cmd, shell=True, stderr=tmp_stderr) returncode = proc.wait() - tmp_stderr.seek(0) - stderr = '' - try: - while True: - stderr += tmp_stderr.read(BUFFSIZE) - if not stderr or len(stderr) % BUFFSIZE != 0: - break - except OverflowError: - pass if returncode != 0: + stderr = get_stderr(tmp_stderr) stop_err(stderr) except Exception, e: stop_err('Error running FIMO:\n%s' % str(e)) shutil.move(os.path.join(args.output_path, 'fimo.txt'), args.txt_output) -shutil.move(os.path.join(args.output_path, 'fimo.gff'), args.gff_output) + +gff_file = os.path.join(args.output_path, 'fimo.gff') +if args.remove_duplicate_coords == 'yes': + tmp_stderr = tempfile.NamedTemporaryFile() + # Identify and eliminating identical motif occurrences. These + # are identical if the combination of chrom, start, end and + # motif id are identical. + cmd = 'sort -k1,1 -k4,4n -k5,5n -k9.1,9.6 -u -o %s %s' % (gff_file, gff_file) + proc = subprocess.Popen(args=cmd, stderr=tmp_stderr, shell=True) + returncode = proc.wait() + if returncode != 0: + stderr = get_stderr(tmp_stderr) + stop_err(stderr) + # Sort GFF output by a combination of chrom, score, start. + cmd = 'sort -k1,1 -k4,4n -k6,6n -o %s %s' % (gff_file, gff_file) + proc = subprocess.Popen(args=cmd, stderr=tmp_stderr, shell=True) + returncode = proc.wait() + if returncode != 0: + stderr = get_stderr(tmp_stderr) + stop_err(stderr) +if args.output_separate_motifs == 'yes': + # Create the collection output directory. + collection_path = (os.path.join(os.getcwd(), 'output')) + # Keep track of motif occurrences. + header_line = None + motif_ids = [] + file_handles = [] + for line in open(gff_file, 'r'): + if line.startswith('#'): + if header_line is None: + header_line = line + continue + items = line.split('\t') + attribute = items[8] + attributes = attribute.split(';') + name = attributes[0] + motif_id = name.split('=')[1] + file_name = os.path.join(collection_path, 'MOTIF%s.gff' % motif_id) + if motif_id in motif_ids: + i = motif_ids.index(motif_id) + fh = file_handles[i] + fh.write(line) + else: + fh = open(file_name, 'wb') + if header_line is not None: + fh.write(header_line) + fh.write(line) + motif_ids.append(motif_id) + file_handles.append(fh) + for file_handle in file_handles: + file_handle.close() +else: + shutil.move(gff_file, args.gff_output) shutil.move(os.path.join(args.output_path, 'fimo.xml'), args.xml_output) shutil.move(os.path.join(args.output_path, 'fimo.html'), args.html_output)