Mercurial > repos > padge > trimal
annotate trimal_repo/scripts/get_sequences_gaps_ratio.py @ 0:b15a3147e604 draft
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
author | padge |
---|---|
date | Fri, 25 Mar 2022 17:10:43 +0000 |
parents | |
children |
rev | line source |
---|---|
0
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
1 #!/usr/bin/python3 |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
2 |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
3 # |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
4 # 'get_sequneces_gaps.py' |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
5 # |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
6 # Script implemented to obtain the sequences index for those seuqneces |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
7 # exceding a minimum gaps' ratio threshold. |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
8 # |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
9 # [2020] S. Capella-Gutierrez - salvador.capella@bsc.es |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
10 # |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
11 # this script is free software: you can redistribute it and/or modify it under |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
12 # the terms of the GNU General Public License as published by the Free |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
13 # Software Foundation, the last available version. |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
14 # |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
15 # this script is distributed in the hope that it will be useful, but WITHOUT |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
16 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
17 # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
18 # more details on <http://www.gnu.org/licenses/> |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
19 # |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
20 from Bio import AlignIO |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
21 import argparse |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
22 import sys |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
23 import os |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
24 |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
25 if __name__ == "__main__": |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
26 |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
27 parser = argparse.ArgumentParser() |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
28 |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
29 parser.add_argument("-i", "--in", dest = "inFile", required = True, type = \ |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
30 str, help = "Input alignment") |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
31 |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
32 parser.add_argument("-o", "--out", dest = "outFile", default = None, type = \ |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
33 str, help = "Set output file. It will be generated into FASTA format") |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
34 |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
35 parser.add_argument("-f", "--format", dest = "inFormat", default = "fasta", \ |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
36 type = str, choices = ["clustal", "fasta-m10", "fasta", "phylip-relaxed", \ |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
37 "phylip-sequential", "phylip", "nexus"],help = "Set input alignment format") |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
38 |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
39 parser.add_argument("-g", "--gap_symbol", dest = "gapSymbol", default = '-', \ |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
40 type = str, help = "Define the gap symbol used in the input alignment") |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
41 |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
42 parser.add_argument("--show_only_index", dest = "showIndexes", default = False, \ |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
43 action = "store_true", help = "Show only the indexes of sequences with a " |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
44 + "gaps' ratio equal or higher than the established threshold") |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
45 |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
46 parser.add_argument("--threshold", dest = "gapsThreshold", default = 0.0, \ |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
47 type = float, help = "Identify sequences with a minimum of gaps' ratio") |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
48 |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
49 parser.add_argument("--keep_header", dest = "keepHeader", default = False, |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
50 action = "store_true", help = "Keep original alignment sequence IDs indepen" |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
51 + "dently of blank spaces on it") |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
52 |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
53 args = parser.parse_args() |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
54 |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
55 if not os.path.isfile(args.inFile): |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
56 sys.exit(("ERROR: Check input alignment file '%s'") % (args.inFile)) |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
57 |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
58 index = 0 |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
59 indexes = [] |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
60 ofile = open(args.outFile, "w") if args.outFile else sys.stdout |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
61 for record in AlignIO.read(args.inFile, format = args.inFormat): |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
62 sequence_id = record.id if not args.keepHeader else record.description |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
63 sequence = str(record.seq) |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
64 |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
65 length = len(sequence) |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
66 valid = len([ps for ps in range(length) if sequence[ps] != args.gapSymbol]) |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
67 gaps_ratio = 1 - (valid/length) |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
68 |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
69 if gaps_ratio >= args.gapsThreshold: |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
70 if not args.showIndexes: |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
71 print(f'{index:4d}\t{sequence_id:30}\t{gaps_ratio:.4f}', file = ofile) |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
72 indexes.append(index) |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
73 index += 1 |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
74 |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
75 if args.showIndexes: |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
76 print (','.join(map(str, indexes)), file = ofile) |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
77 |
b15a3147e604
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
padge
parents:
diff
changeset
|
78 ofile.close() |