0
|
1 #!/usr/bin/env python3
|
|
2
|
|
3 import argparse
|
|
4 from Bio import SeqIO
|
|
5 from Bio.Seq import MutableSeq
|
|
6 from Bio.Alphabet import generic_dna
|
|
7 import sys
|
|
8
|
|
9
|
|
10 def main(args):
|
|
11 # Command line arguments
|
|
12 QUERY = args.query
|
|
13 MODE = args.mode
|
|
14 REPEAT_GFF = args.repeat_gff
|
|
15 MASKED = args.output_masked
|
|
16
|
|
17 repeats_all = get_indices(REPEAT_GFF)
|
|
18
|
|
19 if MODE == "lowercase":
|
|
20 lower_mask(QUERY, repeats_all, MASKED)
|
|
21 else:
|
|
22 N_mask(QUERY, repeats_all, MASKED)
|
|
23
|
|
24
|
|
25 def get_indices(REPEAT_GFF):
|
|
26 '''
|
|
27 Get indices of repeats from GFF file to mask
|
|
28 '''
|
|
29 repeats_all = {}
|
|
30 with open(REPEAT_GFF, "r") as repeats_gff:
|
|
31 for line in repeats_gff:
|
|
32 if not line.startswith("#"):
|
|
33 seq_id = line.split("\t")[0]
|
|
34 start_r = line.split("\t")[3]
|
|
35 end_r = line.split("\t")[4]
|
|
36 if seq_id in repeats_all.keys():
|
|
37 repeats_all[seq_id].append([int(start_r), int(end_r)])
|
|
38 else:
|
|
39 repeats_all[seq_id] = [[int(start_r), int(end_r)]]
|
|
40 return repeats_all
|
|
41
|
|
42
|
|
43 def lower_mask(QUERY, repeats_all, MASKED):
|
|
44 allSeqs = list(SeqIO.parse(QUERY, 'fasta'))
|
|
45 for singleSeq in allSeqs:
|
|
46 mutable = MutableSeq(str(singleSeq.seq), generic_dna)
|
|
47 for index in repeats_all[singleSeq.id]:
|
|
48 for item in range(index[0] - 1, index[1]):
|
|
49 mutable[item] = mutable[item].lower()
|
|
50 singleSeq.seq = mutable
|
|
51 with open(MASKED, "w") as handle:
|
|
52 SeqIO.write(allSeqs, handle, 'fasta')
|
|
53
|
|
54
|
|
55 def N_mask(QUERY, repeats_all, MASKED):
|
|
56 allSeqs = list(SeqIO.parse(QUERY, 'fasta'))
|
|
57 for singleSeq in allSeqs:
|
|
58 mutable = MutableSeq(str(singleSeq.seq), generic_dna)
|
|
59 for index in repeats_all[singleSeq.id]:
|
|
60 for item in range(index[0] - 1, index[1]):
|
|
61 mutable[item] = "N"
|
|
62 singleSeq.seq = mutable
|
|
63 with open(MASKED, "w") as handle:
|
|
64 SeqIO.write(allSeqs, handle, 'fasta')
|
|
65
|
|
66
|
|
67 if __name__ == "__main__":
|
|
68
|
|
69 # Command line arguments
|
|
70 parser = argparse.ArgumentParser()
|
|
71 parser.add_argument('-q',
|
|
72 '--query',
|
|
73 type=str,
|
|
74 required=True,
|
|
75 help='query sequence to be processed')
|
|
76 parser.add_argument('-rg',
|
|
77 '--repeat_gff',
|
|
78 type=str,
|
|
79 required=True,
|
|
80 help='query sequence to be processed')
|
|
81 parser.add_argument('-m',
|
|
82 '--mode',
|
|
83 default="lowercase",
|
|
84 choices=['lowercase', 'N'],
|
|
85 help='query sequence to be processed')
|
|
86 parser.add_argument('-o',
|
|
87 '--output_masked',
|
|
88 type=str,
|
|
89 default="output_masked",
|
|
90 help='query sequence to be processed')
|
|
91
|
|
92 args = parser.parse_args()
|
|
93 main(args)
|