comparison profrep_masking.py @ 0:a5f1638b73be draft

Uploaded
author petr-novak
date Wed, 26 Jun 2019 08:01:42 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:a5f1638b73be
1 #!/usr/bin/env python3
2
3 import argparse
4 from Bio import SeqIO
5 from Bio.Seq import MutableSeq
6 from Bio.Alphabet import generic_dna
7 import sys
8
9
10 def main(args):
11 # Command line arguments
12 QUERY = args.query
13 MODE = args.mode
14 REPEAT_GFF = args.repeat_gff
15 MASKED = args.output_masked
16
17 repeats_all = get_indices(REPEAT_GFF)
18
19 if MODE == "lowercase":
20 lower_mask(QUERY, repeats_all, MASKED)
21 else:
22 N_mask(QUERY, repeats_all, MASKED)
23
24
25 def get_indices(REPEAT_GFF):
26 '''
27 Get indices of repeats from GFF file to mask
28 '''
29 repeats_all = {}
30 with open(REPEAT_GFF, "r") as repeats_gff:
31 for line in repeats_gff:
32 if not line.startswith("#"):
33 seq_id = line.split("\t")[0]
34 start_r = line.split("\t")[3]
35 end_r = line.split("\t")[4]
36 if seq_id in repeats_all.keys():
37 repeats_all[seq_id].append([int(start_r), int(end_r)])
38 else:
39 repeats_all[seq_id] = [[int(start_r), int(end_r)]]
40 return repeats_all
41
42
43 def lower_mask(QUERY, repeats_all, MASKED):
44 allSeqs = list(SeqIO.parse(QUERY, 'fasta'))
45 for singleSeq in allSeqs:
46 mutable = MutableSeq(str(singleSeq.seq), generic_dna)
47 for index in repeats_all[singleSeq.id]:
48 for item in range(index[0] - 1, index[1]):
49 mutable[item] = mutable[item].lower()
50 singleSeq.seq = mutable
51 with open(MASKED, "w") as handle:
52 SeqIO.write(allSeqs, handle, 'fasta')
53
54
55 def N_mask(QUERY, repeats_all, MASKED):
56 allSeqs = list(SeqIO.parse(QUERY, 'fasta'))
57 for singleSeq in allSeqs:
58 mutable = MutableSeq(str(singleSeq.seq), generic_dna)
59 for index in repeats_all[singleSeq.id]:
60 for item in range(index[0] - 1, index[1]):
61 mutable[item] = "N"
62 singleSeq.seq = mutable
63 with open(MASKED, "w") as handle:
64 SeqIO.write(allSeqs, handle, 'fasta')
65
66
67 if __name__ == "__main__":
68
69 # Command line arguments
70 parser = argparse.ArgumentParser()
71 parser.add_argument('-q',
72 '--query',
73 type=str,
74 required=True,
75 help='query sequence to be processed')
76 parser.add_argument('-rg',
77 '--repeat_gff',
78 type=str,
79 required=True,
80 help='query sequence to be processed')
81 parser.add_argument('-m',
82 '--mode',
83 default="lowercase",
84 choices=['lowercase', 'N'],
85 help='query sequence to be processed')
86 parser.add_argument('-o',
87 '--output_masked',
88 type=str,
89 default="output_masked",
90 help='query sequence to be processed')
91
92 args = parser.parse_args()
93 main(args)