comparison fasta_tabular_converter.py @ 3:36388b666bfc draft default tip

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_fasta_tabular_converter commit b6de14061c479f0418cd89e26d6f5ac26e565a07
author drosofff
date Wed, 09 Nov 2016 11:24:13 -0500
parents 330dd8a8c31a
children
comparison
equal deleted inserted replaced
2:330dd8a8c31a 3:36388b666bfc
1 #!/usr/bin/python 1 #!/usr/bin/env python
2 # 2 #
3 import argparse
4 import logging
3 import sys 5 import sys
4 import string
5 import argparse
6 from collections import defaultdict 6 from collections import defaultdict
7
7 8
8 def Parser(): 9 def Parser():
9 the_parser = argparse.ArgumentParser() 10 the_parser = argparse.ArgumentParser()
10 the_parser.add_argument( 11 the_parser.add_argument(
11 '--input', action="store", type=str, help="input file") 12 '--input', action="store", type=str, help="input file")
14 the_parser.add_argument( 15 the_parser.add_argument(
15 '--type', action="store", type=str, help="type of convertion") 16 '--type', action="store", type=str, help="type of convertion")
16 args = the_parser.parse_args() 17 args = the_parser.parse_args()
17 return args 18 return args
18 19
20
19 def readfasta_writetabular(fasta, tabular, mode="oneline"): 21 def readfasta_writetabular(fasta, tabular, mode="oneline"):
20 F = open(fasta, "r") 22 for line in fasta:
21 for line in F:
22 if line[0] == ">": 23 if line[0] == ">":
23 try: 24 try:
24 seqdic["".join(stringlist)] += 1 # to dump the sequence of the previous item - try because of first missing stringlist variable 25 seqdic["".join(stringlist)] += 1 # to dump the sequence of the previous item - try because of first missing stringlist variable
25 except: pass 26 except NameError:
26 stringlist=[] 27 pass
28 stringlist = []
27 else: 29 else:
28 stringlist.append(line[:-1]) 30 try:
31 stringlist.append(line[:-1])
32 except UnboundLocalError: # if file went through filter and contains only empty lines
33 logging.info("first line is empty.")
29 try: 34 try:
30 seqdic["".join(stringlist)] += 1 # for the last sequence 35 seqdic["".join(stringlist)] += 1 # for the last sequence
31 except: pass # in case file to convert is empty 36 except NameError:
32 F.close() 37 logging.info("input file has not fasta sequences.")
33 F = open(tabular, "w")
34 for seq in sorted(seqdic, key=seqdic.get, reverse=True): 38 for seq in sorted(seqdic, key=seqdic.get, reverse=True):
35 print >> F, "%s\t%s" % (seq, seqdic[seq]) 39 tabular.write("%s\t%s\n" % (seq, seqdic[seq]))
36 F.close() 40
37 41
38
39 def readtabular_writefasta(tabular, fasta): 42 def readtabular_writefasta(tabular, fasta):
40 F = open(tabular, "r") 43 counter = 0
41 Fw = open(fasta, "w") 44 for line in tabular:
42 counter = 0 45 fields = line.split()
43 for line in F: 46 for i in range(int(fields[1])):
44 fields = line.split() 47 counter += 1
45 for i in range(int(fields[1])): 48 fasta.write(">%s\n%s\n" % (counter, fields[0]))
46 counter += 1
47 print >> Fw, ">%s\n%s" % (counter, fields[0])
48 F.close()
49 Fw.close()
50 49
51 def readtabular_writefastaweighted (tabular, fasta):
52 F = open(tabular, "r")
53 Fw = open(fasta, "w")
54 counter = 0
55 for line in F:
56 counter += 1
57 fields = line[:-1].split()
58 print >> Fw, ">%s_%s\n%s" % (counter, fields[1], fields[0])
59 F.close()
60 Fw.close()
61 50
62 def readfastaeighted_writefastaweighted(fastaweigthed_input, fastaweigthed_reparsed): 51 def readtabular_writefastaweighted(tabular, fasta):
63 F = open(fastaweigthed_input, "r") 52 counter = 0
64 number_reads = 0 53 for line in tabular:
65 for line in F: 54 counter += 1
66 if line[0] == ">": 55 fields = line[:-1].split()
67 weigth = int(line[1:-1].split("_")[-1]) 56 fasta.write(">%s_%s\n%s\n" % (counter, fields[1], fields[0]))
68 number_reads += weigth
69 else:
70 seqdic[line[:-1]] += weigth
71 F.close()
72 F = open(fastaweigthed_reparsed, "w")
73 n=0
74 for seq in sorted(seqdic, key=seqdic.get, reverse=True):
75 n += 1
76 print >> F, ">%s_%s\n%s" % (n, seqdic[seq], seq)
77 F.close()
78 print "%s reads collapsed" % number_reads
79 57
80 def readfastaeighted_writefasta(fastaweigthed, fasta): 58
81 F = open(fastaweigthed, "r") 59 def readfastaweighted_writefastaweighted(fastaweigthed_input, fastaweigthed_reparsed):
82 Fw = open(fasta, "w") 60 number_reads = 0
83 counter = 0 61 for line in fastaweigthed_input:
84 for line in F: 62 if line[0] == ">":
85 if line[0] == ">": 63 weigth = int(line[1:-1].split("_")[-1])
86 weigth = int(line[1:-1].split("_")[-1]) 64 number_reads += weigth
87 else: 65 else:
88 seq = line[:-1] 66 seqdic[line[:-1]] += weigth
89 for i in range (weigth): 67 n = 0
90 counter += 1 68 for seq in sorted(seqdic, key=seqdic.get, reverse=True):
91 print >> Fw, ">%s\n%s" % (counter, seq) 69 n += 1
92 F.close() 70 fastaweigthed_reparsed.write(">%s_%s\n%s\n" % (n, seqdic[seq], seq))
93 Fw.close() 71 log.info("%s reads collapsed" % number_reads)
72
73
74 def readfastaweighted_writefasta(fastaweigthed, fasta):
75 counter = 0
76 for line in fastaweigthed:
77 if line[0] == ">":
78 weigth = int(line[1:-1].split("_")[-1])
79 else:
80 seq = line[:-1]
81 for i in range(weigth):
82 counter += 1
83 fasta.write(">%s\n%s\n" % (counter, seq))
84
94 85
95 def main(input, output, type): 86 def main(input, output, type):
96 if type == "fasta2tabular": 87 with open(input, "r") as input:
97 readfasta_writetabular(input, output) 88 with open(output, "w") as output:
98 elif type == "tabular2fasta": 89 if type == "fasta2tabular":
99 readtabular_writefasta(input, output) 90 readfasta_writetabular(input, output)
100 elif type == "tabular2fastaweight": 91 elif type == "tabular2fasta":
101 readtabular_writefastaweighted (input, output) 92 readtabular_writefasta(input, output)
102 elif type == "fastaweight2fastaweight": 93 elif type == "tabular2fastaweight":
103 readfastaeighted_writefastaweighted(input, output) 94 readtabular_writefastaweighted(input, output)
104 elif type == "fastaweight2fasta": 95 elif type == "fastaweight2fastaweight":
105 readfastaeighted_writefasta(input, output) 96 readfastaweighted_writefastaweighted(input, output)
97 elif type == "fastaweight2fasta":
98 readfastaweighted_writefasta(input, output)
99
106 100
107 if __name__ == "__main__": 101 if __name__ == "__main__":
108 seqdic = defaultdict(int) 102 seqdic = defaultdict(int)
109 args = Parser() 103 args = Parser()
110 main (args.input, args.output, args.type) 104 log = logging.getLogger(__name__)
105 logging.basicConfig(stream=sys.stdout, level=logging.INFO)
106 main(args.input, args.output, args.type)