comparison trimal_repo/scripts/get_sequences_gaps_ratio.py @ 0:b15a3147e604 draft

"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
author padge
date Fri, 25 Mar 2022 17:10:43 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:b15a3147e604
1 #!/usr/bin/python3
2
3 #
4 # 'get_sequneces_gaps.py'
5 #
6 # Script implemented to obtain the sequences index for those seuqneces
7 # exceding a minimum gaps' ratio threshold.
8 #
9 # [2020] S. Capella-Gutierrez - salvador.capella@bsc.es
10 #
11 # this script is free software: you can redistribute it and/or modify it under
12 # the terms of the GNU General Public License as published by the Free
13 # Software Foundation, the last available version.
14 #
15 # this script is distributed in the hope that it will be useful, but WITHOUT
16 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
17 # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 # more details on <http://www.gnu.org/licenses/>
19 #
20 from Bio import AlignIO
21 import argparse
22 import sys
23 import os
24
25 if __name__ == "__main__":
26
27 parser = argparse.ArgumentParser()
28
29 parser.add_argument("-i", "--in", dest = "inFile", required = True, type = \
30 str, help = "Input alignment")
31
32 parser.add_argument("-o", "--out", dest = "outFile", default = None, type = \
33 str, help = "Set output file. It will be generated into FASTA format")
34
35 parser.add_argument("-f", "--format", dest = "inFormat", default = "fasta", \
36 type = str, choices = ["clustal", "fasta-m10", "fasta", "phylip-relaxed", \
37 "phylip-sequential", "phylip", "nexus"],help = "Set input alignment format")
38
39 parser.add_argument("-g", "--gap_symbol", dest = "gapSymbol", default = '-', \
40 type = str, help = "Define the gap symbol used in the input alignment")
41
42 parser.add_argument("--show_only_index", dest = "showIndexes", default = False, \
43 action = "store_true", help = "Show only the indexes of sequences with a "
44 + "gaps' ratio equal or higher than the established threshold")
45
46 parser.add_argument("--threshold", dest = "gapsThreshold", default = 0.0, \
47 type = float, help = "Identify sequences with a minimum of gaps' ratio")
48
49 parser.add_argument("--keep_header", dest = "keepHeader", default = False,
50 action = "store_true", help = "Keep original alignment sequence IDs indepen"
51 + "dently of blank spaces on it")
52
53 args = parser.parse_args()
54
55 if not os.path.isfile(args.inFile):
56 sys.exit(("ERROR: Check input alignment file '%s'") % (args.inFile))
57
58 index = 0
59 indexes = []
60 ofile = open(args.outFile, "w") if args.outFile else sys.stdout
61 for record in AlignIO.read(args.inFile, format = args.inFormat):
62 sequence_id = record.id if not args.keepHeader else record.description
63 sequence = str(record.seq)
64
65 length = len(sequence)
66 valid = len([ps for ps in range(length) if sequence[ps] != args.gapSymbol])
67 gaps_ratio = 1 - (valid/length)
68
69 if gaps_ratio >= args.gapsThreshold:
70 if not args.showIndexes:
71 print(f'{index:4d}\t{sequence_id:30}\t{gaps_ratio:.4f}', file = ofile)
72 indexes.append(index)
73 index += 1
74
75 if args.showIndexes:
76 print (','.join(map(str, indexes)), file = ofile)
77
78 ofile.close()