comparison cherry_pick_fasta.py @ 3:c282a8a47dd9 draft

"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/cherry_pick_fasta commit d637de6c1090314bd34bdffc2fdf979cb55b870b"
author artbio
date Fri, 21 May 2021 09:34:14 +0000
parents 321cad0eb507
children ba6c4aeb22ea
comparison
equal deleted inserted replaced
2:321cad0eb507 3:c282a8a47dd9
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*- 2 # -*- coding: utf-8 -*-
3 """ 3 # Cherry pick of fasta sequences satisfying a query string in their header/name
4 Cherry pick of fasta sequences satisfying a query string in their header/name 4 import argparse
5 """
6 5
7 import argparse 6 from Bio import SeqIO
8 7
9 8
10 def Parser(): 9 def Parser():
11 the_parser = argparse.ArgumentParser( 10 the_parser = argparse.ArgumentParser(
12 description="Cherry pick fasta sequences") 11 description='Cherry pick fasta sequences')
13 the_parser.add_argument('--input', action="store", type=str, 12 the_parser.add_argument('--input', action='store', type=str,
14 help="input fasta file") 13 help='input fasta file')
15 the_parser.add_argument('--searchfor', action="store", type=str, 14 the_parser.add_argument('--searchfor', action='store', type=str,
16 help="with, without, or withlist, withoutlist") 15 help='with, without, or withlist, withoutlist')
17 the_parser.add_argument('--query-string', dest="query_string", 16 the_parser.add_argument('--mode', action='store', type=str,
18 action="store", type=str, 17 default='includes', help='exact or includes')
19 help="headers containing the string will be \ 18 the_parser.add_argument('--query-string', dest='query_string',
19 action='store', type=str,
20 help='headers containing the string will be \
20 extracted or excluded as well as the \ 21 extracted or excluded as well as the \
21 corresponding sequence") 22 corresponding sequence')
22 the_parser.add_argument('--query-file', dest="query_file", 23 the_parser.add_argument('--query-file', dest='query_file',
23 action="store", type=str, 24 action='store', type=str,
24 help="headers containing any of the strings provided in the \ 25 help='headers containing any of the strings \
25 text file (1 string per line) will be \ 26 provided in the text file (1 string per \
26 extracted or excluded as well as the \ 27 line) will be extracted or excluded as well \
27 corresponding sequence") 28 as the corresponding sequence')
28 29 the_parser.add_argument('--output', action='store', type=str,
29 the_parser.add_argument( 30 help='output fasta file')
30 '--output', action="store", type=str, help="output fasta file")
31 args = the_parser.parse_args() 31 args = the_parser.parse_args()
32 return args 32 return args
33 33
34 34
35 def parse_fasta_with(query, FastaListe): 35 def parse_fasta_dict(query, fasta_dict, mode):
36 if not isinstance(query, list): 36 if not isinstance(query, list):
37 query = [query] 37 query = [query]
38 accumulator = [] 38 accumulator = []
39 for sequence in FastaListe: 39 if mode == 'includes':
40 for string in query: 40 for seq_id in fasta_dict:
41 if string in sequence: 41 for string in query:
42 accumulator.append(sequence) 42 if string in seq_id:
43 continue 43 accumulator.append(seq_id)
44 return accumulator 44 continue
45 elif mode == 'exact':
46 for seq_id in fasta_dict:
47 for string in query:
48 if string == seq_id:
49 accumulator.append(seq_id)
50 continue
51 res_dict = {k: fasta_dict[k] for k in fasta_dict if k in accumulator}
52 return res_dict
45 53
46 54
47 def complement_fasta(fullfasta, subfasta): 55 def complement_fasta_dict(fasta_dict, subfasta_dict):
48 return sorted(list(set(fullfasta) - set(subfasta))) 56 fasta_ids = list(fasta_dict.keys())
57 subfasta_ids = list(subfasta_dict.keys())
58 complement_ids = list(set(fasta_ids) - set(subfasta_ids))
59 sub_dict = {k: fasta_dict[k] for k in fasta_dict if k in complement_ids}
60 return sub_dict
49 61
50 62
51 def getquerylist(file): 63 def getquerylist(file):
52 querylist = [] 64 querylist = []
53 for line in open(file, 'r'): 65 for line in open(file, 'r'):
54 querylist.append(line.rstrip()) 66 querylist.append(line.rstrip())
55 return querylist 67 return querylist
56 68
57 69
58 def __main__(): 70 def buid_fasta_dict(fasta):
59 """ main function """ 71 seq_dict = {rec.id: rec.seq for rec in SeqIO.parse(fasta, "fasta")}
60 args = Parser() 72 return seq_dict
61 searchterm = args.query_string
62 CrudeFasta = open(args.input, "r").read()
63 Output = open(args.output, "w")
64 FastaListe = CrudeFasta.split(">")[1:]
65 if args.query_string:
66 if args.searchfor == 'with':
67 contList = parse_fasta_with(searchterm, FastaListe)
68 contFasta = ">%s" % ">".join(contList)
69 Output.write(contFasta)
70 elif args.searchfor == 'without':
71 notcontList = complement_fasta(FastaListe,
72 parse_fasta_with(searchterm,
73 FastaListe))
74 notcontFasta = ">%s" % ">".join(notcontList)
75 Output.write(notcontFasta)
76 if args.query_file:
77 searchlist = getquerylist(args.query_file)
78 if args.searchfor == 'with':
79 contList = parse_fasta_with(searchlist, FastaListe)
80 contFasta = ">%s" % ">".join(contList)
81 Output.write(contFasta)
82 elif args.searchfor == 'without':
83 notcontList = complement_fasta(FastaListe, parse_fasta_with(
84 searchlist, FastaListe))
85 notcontFasta = ">%s" % ">".join(notcontList)
86 Output.write(notcontFasta)
87 Output.close()
88 73
89 74
90 if __name__ == "__main__": 75 def write_fasta_result(fasta_dict, file):
76 line_length = 60
77 with open(file, 'w') as f:
78 for header in sorted(fasta_dict):
79 f.write('>%s\n' % header)
80 for i in range(line_length, len(fasta_dict[header]), line_length):
81 f.write('%s\n' % fasta_dict[header][i-line_length:i])
82 f.write('%s\n' % fasta_dict[header][i:])
83
84
85 def __main__():
86 ''' main function '''
87 args = Parser()
88 fasta_dict = buid_fasta_dict(args.input)
89 if args.query_string:
90 query = args.query_string
91 elif args.query_file:
92 query = getquerylist(args.query_file)
93 if args.searchfor == 'with':
94 fasta_result_dict = parse_fasta_dict(query, fasta_dict, args.mode)
95 elif args.searchfor == 'without':
96 fasta_result_dict = complement_fasta_dict(fasta_dict, parse_fasta_dict(
97 query, fasta_dict,
98 args.mode))
99 write_fasta_result(fasta_result_dict, args.output)
100
101
102 if __name__ == '__main__':
91 __main__() 103 __main__()