Mercurial > repos > iss > eurl_vtec_wgs_pt
comparison scripts/ReMatCh/utils/gffParser.py @ 0:c6bab5103a14 draft
"planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
| author | iss |
|---|---|
| date | Mon, 21 Mar 2022 15:23:09 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:c6bab5103a14 |
|---|---|
| 1 #!/usr/bin/env python3 | |
| 2 | |
| 3 import argparse | |
| 4 import os | |
| 5 from Bio import SeqIO | |
| 6 from Bio.Seq import Seq | |
| 7 from Bio.SeqRecord import SeqRecord | |
| 8 import ntpath | |
| 9 | |
| 10 version = '1.0' | |
| 11 | |
| 12 | |
| 13 def parse_id(filename): | |
| 14 # get wanted feature IDs | |
| 15 gff_ids = [] | |
| 16 with open(filename, 'r') as in_handle: | |
| 17 for line in in_handle: | |
| 18 line = line.strip() | |
| 19 gff_ids.append(line) | |
| 20 return gff_ids | |
| 21 | |
| 22 | |
| 23 def retrieve_seq_file(fasta_file, coord_file, extra_seq, filename, output_dir): | |
| 24 # Parsing the sequence file, using the provided txt file containing the contig ID and positions to retrieve sequences. | |
| 25 handle = open(fasta_file, "rU") | |
| 26 records_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta")) | |
| 27 handle.close() | |
| 28 | |
| 29 seq_2_get = {} | |
| 30 with open(coord_file, 'r') as sequeces2get: | |
| 31 for line in sequeces2get: | |
| 32 line = line.split(',') | |
| 33 coords = (int(line[-2]), int(line[-1])) | |
| 34 contig_id = line[0] | |
| 35 if contig_id in list(seq_2_get.keys()): | |
| 36 seq_2_get[contig_id].append(coords) | |
| 37 else: | |
| 38 seq_2_get[contig_id] = [coords] | |
| 39 | |
| 40 with open(output_dir + '/' + filename + '.fasta', 'w') as output_handle: | |
| 41 fails = 0 | |
| 42 successes = 0 | |
| 43 records = [] | |
| 44 for contig, listCoords in list(seq_2_get.items()): | |
| 45 contig_seq = records_dict[contig].seq | |
| 46 for coord in listCoords: | |
| 47 coord1 = coord[0] - extra_seq | |
| 48 coord2 = coord[1] + extra_seq | |
| 49 if coord1 < 0 or coord2 > len(contig_seq): | |
| 50 fail_log = open(output_dir + '/' + filename + '_fails.txt', 'a') | |
| 51 fail_log.write(contig + ',' + str(coord[0]) + ',' + str(coord[1]) + '\n') | |
| 52 fail_log.close() | |
| 53 fails += 1 | |
| 54 else: | |
| 55 geneseq = str(contig_seq[coord1:coord2]) | |
| 56 record = SeqRecord(Seq(geneseq), id=str(str(contig) + '#' + str(coord1) + '_' + str(coord2)), | |
| 57 description='') | |
| 58 records.append(record) | |
| 59 successes += 1 | |
| 60 SeqIO.write(records, output_handle, "fasta") | |
| 61 | |
| 62 print('Retrived %s features successfully from %s with %s bp as extra' | |
| 63 ' sequence.' % (str(successes), filename, str(extra_seq))) | |
| 64 if fails > 0: | |
| 65 print('%s featrued failed to retrieve. Check %s_fails.txt file.' % (str(fails), filename)) | |
| 66 | |
| 67 | |
| 68 def retrieve_seq(fasta_file, gff_features, extra_seq, filename, output_dir): | |
| 69 # parsing the sequence file into a SeqIO dictionary. one contig per entry | |
| 70 handle = open(fasta_file, "rU") | |
| 71 records_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta")) | |
| 72 handle.close() | |
| 73 | |
| 74 with open(output_dir + '/' + filename + '.fasta', 'w') as output_handle: | |
| 75 fails = 0 | |
| 76 successes = 0 | |
| 77 records = [] | |
| 78 for locus, location in list(gff_features.items()): | |
| 79 # print locus | |
| 80 contig_seq = records_dict[location[0]].seq | |
| 81 coord1 = location[1] - extra_seq | |
| 82 coord2 = location[2] + extra_seq | |
| 83 if coord1 < 0 or coord2 > len(contig_seq): | |
| 84 fail_log = open(output_dir + '/' + filename + '_fails.txt', 'a') | |
| 85 fail_log.write(locus + '\n') | |
| 86 fail_log.close() | |
| 87 fails += 1 | |
| 88 else: | |
| 89 geneseq = str(contig_seq[coord1:coord2]) | |
| 90 if location[3] == '-': | |
| 91 seq = Seq(geneseq) | |
| 92 geneseq = str(seq.reverse_complement()) | |
| 93 record = SeqRecord(Seq(geneseq), | |
| 94 id=str(locus + '-' + str(location[0]) + '#' + str(location[1]) + '_' + | |
| 95 str(location[2])), | |
| 96 description='') | |
| 97 records.append(record) | |
| 98 successes += 1 | |
| 99 SeqIO.write(records, output_handle, "fasta") | |
| 100 print('Retrived %s features successfully from %s with %s bp as extra' | |
| 101 ' sequence.' % (str(successes), filename, str(extra_seq))) | |
| 102 if fails > 0: | |
| 103 print('%s featrued failed to retrieve. Check %s_fails.txt file.' % (str(fails), filename)) | |
| 104 | |
| 105 | |
| 106 def parse_features(temp_gff): | |
| 107 # parsing the feature file into a dictionary | |
| 108 gff_features = {} | |
| 109 | |
| 110 with open(temp_gff, 'r') as temp_genes: | |
| 111 for line in temp_genes: | |
| 112 line = line.split('\t') | |
| 113 if "CDS" in line[2]: | |
| 114 id = line[-1].split(';') | |
| 115 locus_id = str(id[0].split('=')[1]) | |
| 116 contig = line[0] | |
| 117 begining = int(line[3]) - 1 # to get the full sequence | |
| 118 end = int(line[4]) | |
| 119 strand = line[6] | |
| 120 location = [contig, begining, end, strand] | |
| 121 gff_features[locus_id] = location | |
| 122 return gff_features | |
| 123 | |
| 124 | |
| 125 def gff_parser(gff_file, extra_seq=0, output_dir='.', keep_temporary_files=False, ids=None, coord_file=None): | |
| 126 filename = ntpath.basename(gff_file).replace('.gff', '') | |
| 127 | |
| 128 # cleaning temp files if they exist | |
| 129 if os.path.isfile(output_dir + '/' + filename + '_features.gff'): | |
| 130 os.remove(output_dir + '/' + filename + '_features.gff') | |
| 131 if os.path.isfile(output_dir + '/' + filename + '_sequence.fasta'): | |
| 132 os.remove(output_dir + '/' + filename + '_sequence.fasta') | |
| 133 | |
| 134 # cleaning fails file if it exists | |
| 135 if os.path.isfile(output_dir + '/' + filename + '_fails.txt'): | |
| 136 os.remove(output_dir + '/' + filename + '_fails.txt') | |
| 137 | |
| 138 if coord_file is None: | |
| 139 | |
| 140 if ids is not None: | |
| 141 select_ids = parse_id(ids) | |
| 142 else: | |
| 143 select_ids = None | |
| 144 | |
| 145 # separating the gff into 2 different files: one with the features and another with the conting sequences | |
| 146 with open(gff_file, 'r') as in_handle, open(output_dir + '/' + filename + '_features.gff', 'a') as temp_genes, \ | |
| 147 open(output_dir + '/' + filename + '_sequence.fasta', 'a') as temp_contigs: | |
| 148 for line in in_handle: | |
| 149 if not line.startswith('##'): | |
| 150 if '\t' in line: | |
| 151 if select_ids is not None: | |
| 152 items = line.split('\t') | |
| 153 id = items[-1].split(';')[0] | |
| 154 id = id.split('=')[1] | |
| 155 if id in select_ids: | |
| 156 temp_genes.write(line) | |
| 157 else: | |
| 158 temp_genes.write(line) | |
| 159 else: | |
| 160 temp_contigs.write(line) | |
| 161 | |
| 162 gff_files = parse_features(output_dir + '/' + filename + '_features.gff') | |
| 163 | |
| 164 retrieve_seq(output_dir + '/' + filename + '_sequence.fasta', gff_files, extra_seq, filename, output_dir) | |
| 165 | |
| 166 else: | |
| 167 with open(gff_file, 'r') as in_handle, \ | |
| 168 open(output_dir + '/' + filename + '_sequence.fasta', 'a') as temp_contigs: | |
| 169 for line in in_handle: | |
| 170 if not line.startswith('##'): | |
| 171 if '\t' in line: | |
| 172 pass | |
| 173 else: | |
| 174 temp_contigs.write(line) | |
| 175 | |
| 176 retrieve_seq_file(output_dir + '/' + filename + '_sequence.fasta', coord_file, extra_seq, filename, output_dir) | |
| 177 | |
| 178 # removing temp files | |
| 179 if not keep_temporary_files: | |
| 180 try: | |
| 181 os.remove(output_dir + '/' + filename + '_features.gff') | |
| 182 except: | |
| 183 pass | |
| 184 os.remove(output_dir + '/' + filename + '_sequence.fasta') | |
| 185 | |
| 186 | |
| 187 def main(): | |
| 188 parser = argparse.ArgumentParser(prog='gffParser.py', description='GFF3 parser for feature sequence retrival.', | |
| 189 epilog='by C I Mendes (cimendes@medicina.ulisboa.pt)') | |
| 190 parser.add_argument('--version', help='Version information', action='version', version=str('%(prog)s v' + version)) | |
| 191 | |
| 192 parser.add_argument('-i', '--input', | |
| 193 help='GFF3 file to parse, containing both sequences and annotations (like the one obtained from' | |
| 194 ' PROKKA).', type=argparse.FileType('r'), required=True) | |
| 195 parser.add_argument('-x', '--extraSeq', help='Extra sequence to retrieve per feature in gff.', default=0, type=int, | |
| 196 required=False) | |
| 197 parser.add_argument('-k', '--keepTemporaryFiles', help='Keep temporary gff(without sequence) and fasta files.', | |
| 198 action='store_true') | |
| 199 parser.add_argument('-o', '--outputDir', help='Path to where the output is to be saved.', default='.', | |
| 200 required=False) | |
| 201 | |
| 202 parser_optional_selected_regions_exclusive = parser.add_mutually_exclusive_group() | |
| 203 parser_optional_selected_regions_exclusive.add_argument('-s', '--select', | |
| 204 help='txt file with the IDs of interest, one per line', | |
| 205 type=argparse.FileType('r'), required=False) | |
| 206 parser_optional_selected_regions_exclusive.add_argument('-f', '--fromFile', | |
| 207 help='Sequence coordinates to be retrieved. Requires contig' | |
| 208 ' ID and coords (contig,strart,end) in a csv file. One' | |
| 209 ' per line.', type=argparse.FileType('r'), | |
| 210 required=False) | |
| 211 | |
| 212 args = parser.parse_args() | |
| 213 | |
| 214 args.outputDir = os.path.abspath(args.outputDir) | |
| 215 if not os.path.isdir(args.outputDir): | |
| 216 os.makedirs(args.outputDir) | |
| 217 | |
| 218 gff_parser(os.path.abspath(args.input.name), args.extraSeq, os.path.abspath(args.outputDir), | |
| 219 args.keepTemporaryFiles, | |
| 220 os.path.abspath(args.select.name) if args.select is not None else None, | |
| 221 os.path.abspath(args.fromFile.name) if args.fromFile is not None else None) | |
| 222 | |
| 223 | |
| 224 if __name__ == "__main__": | |
| 225 main() |
