Mercurial > repos > padge > trimal
diff trimal_repo/scripts/generateRandomAlignmentsUsingAsSeedRealAlignments.py @ 0:b15a3147e604 draft
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
author | padge |
---|---|
date | Fri, 25 Mar 2022 17:10:43 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/trimal_repo/scripts/generateRandomAlignmentsUsingAsSeedRealAlignments.py Fri Mar 25 17:10:43 2022 +0000 @@ -0,0 +1,143 @@ +#!/usr/bin/python +import os +import Bio +import sys +import random +import argparse +import numpy as np +from Bio import SeqIO + +def splitSequence(seq, length = 80): + ''' Split a given sequence contained in one line into lines of size "length" + ''' + return "\n".join([seq[i:i + length] for i in range(0, len(seq), length)]) + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + + parser.add_argument("-i", "--in", dest = "inFile", required = True, type = \ + str, help = "Input Codon alignment") + + parser.add_argument("-o", "--out", dest = "outFile", default = None, type = \ + str, help = "Set output file") + + parser.add_argument("-s", "--numb_sequences", dest = "numb_sequences", \ + default = 2, type = int, help = "Set how many sequences the output " + + "alignment should contain") + + parser.add_argument("-r", "--numb_residues", dest = "numb_residues", \ + default = 100, type = int, help = "Set how many residues the output " + + "alignment should contain") + + parser.add_argument("-f", "--input_format", dest = "inFormat", type = str, \ + default = "fasta", help = "Set input alignment format") + + parser.add_argument("-g", "--gap_symbol", dest = "gapSymbol", default = '-', \ + type = str, help = "Define the gap symbol used in the input/output " + + "alignments") + + parser.add_argument("-m", "--max_attempts", dest = "attempts", default = 10, \ + type = int, help = "Define a maximum numnber of attempts when generating " + + "a random alignment before giving it up") + + args = parser.parse_args() + + ## Check input parameters + if not os.path.isfile(args.inFile): + sys.exit(("ERROR: Check input alignment file '%s'") % (args.inFile)) + + if args.numb_sequences < 2: + sys.exit(("ERROR: Check input sequences '%s'") % (str(args.numb_sequences))) + + if args.numb_residues < 2: + sys.exit(("ERROR: Check input residues '%s'") % (str(args.numb_residues))) + + if args.attempts < 1: + sys.exit(("ERROR: Check max. number of attempts '%s'") % (str(args.attempts))) + + ## Read input alignment and get some basic information from it e.g. + ## sequences names, residues number, etc. + algLen = -1 + alignment = {} + for record in SeqIO.parse(args.inFile, args.inFormat): + seq = str(record.seq) + alignment.setdefault(record.id, seq) + if algLen == -1: + algLen = len(seq) + if len(seq) != algLen: + print("Detected Inconsistencies at Sequence's length", file = sys.stderr) + + sequences = list(alignment.keys()) + columns = list(range(algLen)) + + ## Select randomly sequences and columns from the input alignment to populate + ## the output alignment controlling there are not sequences nor columns + ## composed only by gaps. + + ## This is an iterative process + selected_seqs = [] + discarded_seqs = set() + selected_cols = [] + discarded_cols = set() + + ## Set a counter to control how many attempts are done for generating the + ## random alignment + max_attempts = 0 + while True: + + while len(selected_seqs) < args.numb_sequences: + selected = random.choice(sequences) + if not selected in discarded_seqs: + selected_seqs.append(selected) + + while len(selected_cols) < args.numb_residues: + selected = random.choice(columns) + if not selected in discarded_cols: + selected_cols.append(selected) + + generated = {} + for seq in selected_seqs: + if seq in generated: + continue + ## We check generated sequences are not composed only by gaps + sequence = [alignment[seq][pos] for pos in selected_cols] + if set(sequence) - set([args.gapSymbol]) == set([]): + discarded_seqs.add(seq) + continue + generated.setdefault(seq, splitSequence("".join(sequence))) + + ## We have to check there are not columns composed only by gaps + for column in range(len(selected_cols)): + individual_column = [generated[seq][column] for seq in generated] + if set(individual_column) - set([args.gapSymbol]) == set([]): + discarded_cols.add(selected_cols[column]) + + ## We check which sequences/residues remain after controlling by those + ## composed only by gaps + selected_seqs = [s for s in selected_seqs if not s in discarded_seqs] + selected_cols = [c for c in selected_cols if not c in discarded_cols] + + if len(selected_seqs) == args.numb_sequences and \ + len(selected_cols) == args.numb_residues: + break + + max_attempts += 1 + if max_attempts == args.attempts: + sys.exit(("ERROR: Impossible to generate random alignment after '%s' " + + "attempts. Check configuration") % (args.attempts)) + + ## Produce the output aligment. + n = 1 + ofile = open(args.outFile, "w") if args.outFile else sys.stdout + + ## How to properly name output sequences including a padding to have + ## homogeneuous ids + padding = int(np.ceil(np.log10(args.numb_sequences))) + if args.numb_sequences % 10 == 0: + padding += 1 + + for seq in selected_seqs: + print(">seq_%s\n%s" % (str(n).zfill(padding), generated[seq]), file = ofile) + n += 1 + ofile.close()