annotate trimal_repo/scripts/get_sequences_gaps_ratio.py @ 0:b15a3147e604 draft

"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
author padge
date Fri, 25 Mar 2022 17:10:43 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
1 #!/usr/bin/python3
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
2
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
3 #
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
4 # 'get_sequneces_gaps.py'
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
5 #
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
6 # Script implemented to obtain the sequences index for those seuqneces
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
7 # exceding a minimum gaps' ratio threshold.
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
8 #
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
9 # [2020] S. Capella-Gutierrez - salvador.capella@bsc.es
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
10 #
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
11 # this script is free software: you can redistribute it and/or modify it under
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
12 # the terms of the GNU General Public License as published by the Free
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
13 # Software Foundation, the last available version.
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
14 #
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
15 # this script is distributed in the hope that it will be useful, but WITHOUT
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
16 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
17 # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
18 # more details on <http://www.gnu.org/licenses/>
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
19 #
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
20 from Bio import AlignIO
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
21 import argparse
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
22 import sys
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
23 import os
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
24
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
25 if __name__ == "__main__":
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
26
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
27 parser = argparse.ArgumentParser()
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
28
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
29 parser.add_argument("-i", "--in", dest = "inFile", required = True, type = \
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
30 str, help = "Input alignment")
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
31
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
32 parser.add_argument("-o", "--out", dest = "outFile", default = None, type = \
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
33 str, help = "Set output file. It will be generated into FASTA format")
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
34
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
35 parser.add_argument("-f", "--format", dest = "inFormat", default = "fasta", \
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
36 type = str, choices = ["clustal", "fasta-m10", "fasta", "phylip-relaxed", \
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
37 "phylip-sequential", "phylip", "nexus"],help = "Set input alignment format")
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
38
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
39 parser.add_argument("-g", "--gap_symbol", dest = "gapSymbol", default = '-', \
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
40 type = str, help = "Define the gap symbol used in the input alignment")
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
41
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
42 parser.add_argument("--show_only_index", dest = "showIndexes", default = False, \
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
43 action = "store_true", help = "Show only the indexes of sequences with a "
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
44 + "gaps' ratio equal or higher than the established threshold")
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
45
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
46 parser.add_argument("--threshold", dest = "gapsThreshold", default = 0.0, \
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
47 type = float, help = "Identify sequences with a minimum of gaps' ratio")
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
48
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
49 parser.add_argument("--keep_header", dest = "keepHeader", default = False,
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
50 action = "store_true", help = "Keep original alignment sequence IDs indepen"
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
51 + "dently of blank spaces on it")
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
52
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
53 args = parser.parse_args()
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
54
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
55 if not os.path.isfile(args.inFile):
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
56 sys.exit(("ERROR: Check input alignment file '%s'") % (args.inFile))
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
57
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
58 index = 0
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
59 indexes = []
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
60 ofile = open(args.outFile, "w") if args.outFile else sys.stdout
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
61 for record in AlignIO.read(args.inFile, format = args.inFormat):
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
62 sequence_id = record.id if not args.keepHeader else record.description
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
63 sequence = str(record.seq)
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
64
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
65 length = len(sequence)
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
66 valid = len([ps for ps in range(length) if sequence[ps] != args.gapSymbol])
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
67 gaps_ratio = 1 - (valid/length)
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
68
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
69 if gaps_ratio >= args.gapsThreshold:
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
70 if not args.showIndexes:
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
71 print(f'{index:4d}\t{sequence_id:30}\t{gaps_ratio:.4f}', file = ofile)
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
72 indexes.append(index)
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
73 index += 1
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
74
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
75 if args.showIndexes:
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
76 print (','.join(map(str, indexes)), file = ofile)
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
77
b15a3147e604 "planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff changeset
78 ofile.close()