Mercurial > repos > padge > trimal
comparison trimal_repo/scripts/get_sequence_representative_from_alignment.py @ 0:b15a3147e604 draft
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
author | padge |
---|---|
date | Fri, 25 Mar 2022 17:10:43 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:b15a3147e604 |
---|---|
1 #!/usr/bin/python | |
2 | |
3 # | |
4 # 'get_sequence_representative_from_alignment.py' | |
5 # | |
6 # Script implemented to work with trimAl to analyze gaps statistics and decide | |
7 # which are the boundaries in a given alignment - columns inbetween these | |
8 # boundaries will not be removed independently of the trimming strategy | |
9 # selected. | |
10 # | |
11 # [2014] S. Capella-Gutierrez - scapella@crg.es | |
12 # | |
13 # this script is free software: you can redistribute it and/or modify it under | |
14 # the terms of the GNU General Public License as published by the Free | |
15 # Software Foundation, the last available version. | |
16 # | |
17 # this script is distributed in the hope that it will be useful, but WITHOUT | |
18 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
19 # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | |
20 # more details on <http://www.gnu.org/licenses/> | |
21 # | |
22 from Bio import AlignIO | |
23 import numpy as np | |
24 import argparse | |
25 import sys | |
26 import os | |
27 | |
28 if __name__ == "__main__": | |
29 | |
30 parser = argparse.ArgumentParser() | |
31 | |
32 parser.add_argument("-i", "--in", dest = "inFile", required = True, type = \ | |
33 str, help = "Input alignment") | |
34 | |
35 parser.add_argument("-o", "--out", dest = "outFile", default = None, type = \ | |
36 str, help = "Set output file") | |
37 | |
38 parser.add_argument("-f", "--format", dest = "inFormat", default = "fasta", \ | |
39 type = str, choices = ["clustal", "fasta-m10", "fasta", "phylip-relaxed", \ | |
40 "phylip-sequential", "phylip", "nexus"],help = "Set input alignment format") | |
41 | |
42 parser.add_argument("-g", "--gap_symbol", dest = "gapSymbol", default = '-', \ | |
43 type = str, help = "Define the gap symbol used in the input alignment") | |
44 | |
45 parser.add_argument("--keep_header", dest = "keepHeader", default = False, | |
46 action = "store_true", help = "Keep original alignment sequence IDs indepen" | |
47 + "dently of blank spaces on it") | |
48 | |
49 parser.add_argument("-v", "--verbose", dest = "verbose", default = False, | |
50 action = "store_true", help = "Activate verbosity") | |
51 | |
52 args = parser.parse_args() | |
53 | |
54 if not os.path.isfile(args.inFile): | |
55 sys.exit(("ERROR: Check input alignment file '%s'") % (args.inFile)) | |
56 | |
57 identities, sequences = {}, {} | |
58 for record in AlignIO.read(args.inFile, format = args.inFormat): | |
59 current_seq = str(record.seq) | |
60 sequence_length = len(current_seq) | |
61 sequence_id = record.id if not args.keepHeader else record.description | |
62 | |
63 for seq in sequences: | |
64 ## Identity score is computed considering all positions for which at least | |
65 ## one of the sequences has a non-gap symbol | |
66 valid_pos = [ pos for pos in range(sequence_length) if current_seq[pos] \ | |
67 != args.gapSymbol or sequences[seq][0][pos] == args.gapSymbol ] | |
68 | |
69 identical = [ pos for pos in valid_pos if sequences[seq][0][pos] == \ | |
70 current_seq[pos]] | |
71 | |
72 ratio = float(len(identical))/len(valid_pos) | |
73 identities.setdefault(sequence_id, {}).setdefault(seq, ratio) | |
74 identities.setdefault(seq, {}).setdefault(sequence_id, ratio) | |
75 | |
76 ## Save current sequence and move on to the nex one | |
77 ungapped = current_seq.replace(args.gapSymbol, "") | |
78 sequences.setdefault(sequence_id, [current_seq, ungapped, len(ungapped)]) | |
79 | |
80 selection, maxIdentity = set(), 0 | |
81 for refer in sequences: | |
82 avg = np.average([identities[refer][seq] for seq in identities[refer]]) | |
83 if args.verbose: | |
84 print >> sys.stderr, ("%-20s\t%.6f") % (refer, avg) | |
85 ## Save current sequence if it has a greater identity score | |
86 if avg > maxIdentity: | |
87 maxIdentity = avg | |
88 selection = set([(sequences[refer][1], refer)]) | |
89 elif avg == maxIdentity: | |
90 selection |= set([(sequences[refer][1], refer)]) | |
91 | |
92 representative = sorted(selection, reverse = True)[0][1] | |
93 ofile = open(args.outFile, "w") if args.outFile else sys.stdout | |
94 print >> ofile, (">%s\n%s") % (representative, sequences[representative][1]) | |
95 ofile.close() |