diff cherry_pick_fasta.py @ 1:ea8fde9c6f82 draft

"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/cherry_pick_fasta commit f527add7e7bace30b8bc67524ff1da1bf920ec29"
author artbio
date Wed, 09 Oct 2019 18:48:17 -0400
parents e3aee4ba49c6
children 321cad0eb507
line wrap: on
line diff
--- a/cherry_pick_fasta.py	Sun Oct 15 13:26:45 2017 -0400
+++ b/cherry_pick_fasta.py	Wed Oct 09 18:48:17 2019 -0400
@@ -12,27 +12,78 @@
         description="Cherry pick fasta sequences")
     the_parser.add_argument('--input', action="store", type=str,
                             help="input fasta file")
+    the_parser.add_argument('--searchfor', action="store", type=str,
+                            help="with, without, or withlist, withoutlist")
     the_parser.add_argument('--query-string', dest="query_string",
                             action="store", type=str,
-                            help="header containing the string will be\
-                                  extracted as well as the corresponding\
-                                  sequence")
+                            help="headers containing the string will be \
+                                  extracted or excluded as well as the \
+                                  corresponding sequence")
+    the_parser.add_argument('--query-file', dest="query_file",
+                            action="store", type=str,
+                            help="headers containing any of the strings provided in the \
+                                  text file (1 string per line) will be \
+                                  extracted or excluded as well as the \
+                                  corresponding sequence")
+
     the_parser.add_argument(
         '--output', action="store", type=str, help="output fasta file")
     args = the_parser.parse_args()
     return args
 
 
+def parse_fasta_with(query, FastaListe):
+    """Return the records of FastaListe that contain at least one query.
+
+    query -- a single search string, or a list of search strings
+    FastaListe -- list of records, each a string searched as a whole
+
+    The match is a plain substring test on the whole record, so a query
+    can hit any part of the record text. Records are returned in input
+    order and each record is returned at most once.
+    """
+    if not isinstance(query, list):
+        query = [query]
+    # any() stops at the first matching string, so a record that matches
+    # several query strings is still collected only once.
+    return [sequence for sequence in FastaListe
+            if any(string in sequence for string in query)]
+
+
+def complement_fasta(fullfasta, subfasta):
+    """Return the records of fullfasta that are not present in subfasta.
+
+    The order of fullfasta is preserved and repeated records are kept; a
+    plain set difference would shuffle the records and collapse repeats.
+    """
+    excluded = set(subfasta)
+    return [sequence for sequence in fullfasta if sequence not in excluded]
+
+
+def getquerylist(file):
+    """Read search strings from a text file, one string per line.
+
+    Trailing whitespace (including the newline) is stripped from each
+    line. Lines that are empty after stripping are skipped: an empty
+    string is a substring of every record and would match all of them.
+    """
+    # The with block closes the file even if reading fails.
+    with open(file, 'r') as handle:
+        stripped = (line.rstrip() for line in handle)
+        return [line for line in stripped if line]
+
+
 def __main__():
     """ main function """
     args = Parser()
-    search_term = args.query_string
     CrudeFasta = open(args.input, "r").read()
     Output = open(args.output, "w")
-    FastaListe = CrudeFasta.split(">")
-    for sequence in FastaListe:
-        if search_term in sequence:
-            Output.write(">%s\n" % sequence.rstrip())
+    # Drop whatever precedes the first ">"; each remaining item is the
+    # text of one record without its leading ">".
+    FastaListe = CrudeFasta.split(">")[1:]
+    # A query string and a query file are processed the same way; when
+    # both are given, the query-string selection is written first.
+    queries = []
+    if args.query_string:
+        queries.append(args.query_string)
+    if args.query_file:
+        queries.append(getquerylist(args.query_file))
+    for query in queries:
+        if args.searchfor == 'with':
+            selected = parse_fasta_with(query, FastaListe)
+        elif args.searchfor == 'without':
+            selected = complement_fasta(
+                FastaListe, parse_fasta_with(query, FastaListe))
+        else:
+            # Any other --searchfor value selects nothing.
+            selected = []
+        # Skip empty selections: joining zero records would otherwise
+        # write a lone ">" and produce a malformed FASTA file.
+        if selected:
+            Output.write(">%s" % ">".join(selected))
     Output.close()