diff trimal_repo/scripts/generateRandomAlignmentsUsingAsSeedRealAlignments.py @ 0:b15a3147e604 draft

"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
author padge
date Fri, 25 Mar 2022 17:10:43 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/trimal_repo/scripts/generateRandomAlignmentsUsingAsSeedRealAlignments.py	Fri Mar 25 17:10:43 2022 +0000
@@ -0,0 +1,143 @@
+#!/usr/bin/python
+import os
+import Bio
+import sys
+import random
+import argparse
+import numpy as np
+from Bio import SeqIO
+
+def splitSequence(seq, length = 80):
+  ''' Split a given sequence contained in one line into lines of size "length"
+  '''
+  return "\n".join([seq[i:i + length] for i in range(0, len(seq), length)])
+
+if __name__ == "__main__":
+
+  parser = argparse.ArgumentParser()
+
+  parser.add_argument("-i", "--in", dest = "inFile", required = True, type = \
+    str, help = "Input Codon alignment")
+
+  parser.add_argument("-o", "--out", dest = "outFile", default = None, type = \
+    str, help = "Set output file")
+
+  parser.add_argument("-s", "--numb_sequences", dest = "numb_sequences", \
+    default = 2, type = int, help = "Set how many sequences the output "
+    + "alignment should contain")
+
+  parser.add_argument("-r", "--numb_residues", dest = "numb_residues", \
+    default = 100, type = int, help = "Set how many residues the output "
+    + "alignment should contain")
+
+  parser.add_argument("-f", "--input_format", dest = "inFormat", type = str, \
+    default = "fasta", help = "Set input alignment format")
+
+  parser.add_argument("-g", "--gap_symbol", dest = "gapSymbol", default = '-', \
+    type = str, help = "Define the gap symbol used in the input/output "
+    + "alignments")
+
+  parser.add_argument("-m", "--max_attempts", dest = "attempts", default = 10, \
+    type = int, help = "Define a maximum numnber of attempts when generating "
+    + "a random alignment before giving it up")
+
+  args = parser.parse_args()
+
+  ## Check input parameters
+  if not os.path.isfile(args.inFile):
+    sys.exit(("ERROR: Check input alignment file '%s'") % (args.inFile))
+
+  if args.numb_sequences < 2:
+    sys.exit(("ERROR: Check input sequences '%s'") % (str(args.numb_sequences)))
+
+  if args.numb_residues < 2:
+    sys.exit(("ERROR: Check input residues '%s'") % (str(args.numb_residues)))
+
+  if args.attempts < 1:
+    sys.exit(("ERROR: Check max. number of attempts '%s'") % (str(args.attempts)))
+
+  ## Read input alignment and get some basic information from it e.g.
+  ## sequences names, residues number, etc.
+  algLen = -1
+  alignment = {}
+  for record in SeqIO.parse(args.inFile, args.inFormat):
+    seq = str(record.seq)
+    alignment.setdefault(record.id, seq)
+    if algLen == -1:
+      algLen = len(seq)
+    if len(seq) != algLen:
+      print("Detected Inconsistencies at Sequence's length", file = sys.stderr)
+
+  sequences = list(alignment.keys())
+  columns = list(range(algLen))
+
+  ## Select randomly sequences and columns from the input alignment to populate
+  ## the output alignment controlling there are not sequences nor columns
+  ## composed only by gaps.
+  
+  ## This is an iterative process
+  selected_seqs = []
+  discarded_seqs = set()
+  selected_cols = []
+  discarded_cols = set()
+
+  ## Set a counter to control how many attempts are done for generating the
+  ## random alignment
+  max_attempts = 0
+  while True:
+
+    while len(selected_seqs) < args.numb_sequences:
+      selected = random.choice(sequences)
+      if not selected in discarded_seqs:
+        selected_seqs.append(selected)
+
+    while len(selected_cols) < args.numb_residues:
+      selected = random.choice(columns)
+      if not selected in discarded_cols:
+        selected_cols.append(selected)
+
+    generated = {}
+    for seq in selected_seqs:
+      if seq in generated:
+        continue
+      ## We check generated sequences are not composed only by gaps
+      sequence = [alignment[seq][pos] for pos in selected_cols]
+      if set(sequence) - set([args.gapSymbol]) == set([]):
+        discarded_seqs.add(seq)
+        continue
+      generated.setdefault(seq, splitSequence("".join(sequence)))
+
+    ## We have to check there are not columns composed only by gaps
+    for column in range(len(selected_cols)):
+      individual_column = [generated[seq][column] for seq in generated]
+      if set(individual_column) - set([args.gapSymbol]) == set([]):
+        discarded_cols.add(selected_cols[column])
+      
+    ## We check which sequences/residues remain after controlling by those
+    ## composed only by gaps
+    selected_seqs = [s for s in selected_seqs if not s in discarded_seqs]
+    selected_cols = [c for c in selected_cols if not c in discarded_cols]
+
+    if len(selected_seqs) == args.numb_sequences and \
+      len(selected_cols) == args.numb_residues:
+      break
+
+    max_attempts += 1
+    if max_attempts == args.attempts:
+      sys.exit(("ERROR: Impossible to generate random alignment after '%s' "
+        + "attempts. Check configuration") % (args.attempts))
+  
+  ## Produce the output aligment.
+  n = 1
+  ofile = open(args.outFile, "w") if args.outFile else sys.stdout
+
+  ## How to properly name output sequences including a padding to have
+  ## homogeneuous ids
+  padding = int(np.ceil(np.log10(args.numb_sequences)))
+  if args.numb_sequences % 10 == 0:
+    padding += 1
+ 
+  for seq in selected_seqs:
+    print(">seq_%s\n%s" % (str(n).zfill(padding), generated[seq]), file = ofile)
+    n += 1
+  ofile.close()