annotate fasta_tabular_converter.py @ 1:2f7278120be9 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_fasta_tabular_converter commit 6a93f2809e2939f9d847c3238bfbff8ead367d9f
author drosofff
date Tue, 22 Mar 2016 18:54:00 -0400
parents 951cb6b3979b
children 330dd8a8c31a
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
951cb6b3979b planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
1 #!/usr/bin/python
951cb6b3979b planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
2 #
951cb6b3979b planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
3 import sys
1
2f7278120be9 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_fasta_tabular_converter commit 6a93f2809e2939f9d847c3238bfbff8ead367d9f
drosofff
parents: 0
diff changeset
4 import string
2f7278120be9 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_fasta_tabular_converter commit 6a93f2809e2939f9d847c3238bfbff8ead367d9f
drosofff
parents: 0
diff changeset
5 import argparse
0
951cb6b3979b planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
6 from collections import defaultdict
951cb6b3979b planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
7
1
2f7278120be9 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_fasta_tabular_converter commit 6a93f2809e2939f9d847c3238bfbff8ead367d9f
drosofff
parents: 0
diff changeset
def Parser():
    """Parse the command-line options of the converter.

    All three options are mandatory: without ``--input``/``--output`` the
    converters would fail later with an opaque error from ``open(None)``,
    and an unrecognised ``--type`` would otherwise result in no conversion
    being performed at all.

    Returns:
        argparse.Namespace: namespace with the string attributes ``input``,
        ``output`` and ``type``.
    """
    the_parser = argparse.ArgumentParser()
    the_parser.add_argument(
        '--input', action="store", type=str, required=True,
        help="input file")
    the_parser.add_argument(
        '--output', action="store", type=str, required=True,
        help="output converted file")
    the_parser.add_argument(
        '--type', action="store", type=str, required=True,
        # Conversion names dispatched on by main() in this file.
        choices=[
            "fasta2tabular",
            "tabular2fasta",
            "tabular2fastaweight",
            "fastaweight2fastaweight",
            "fastaweight2fasta",
        ],
        help="type of conversion")
    args = the_parser.parse_args()
    return args
2f7278120be9 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_fasta_tabular_converter commit 6a93f2809e2939f9d847c3238bfbff8ead367d9f
drosofff
parents: 0
diff changeset
18
2f7278120be9 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_fasta_tabular_converter commit 6a93f2809e2939f9d847c3238bfbff8ead367d9f
drosofff
parents: 0
diff changeset
def readfasta_writetabular(fasta, tabular, mode="oneline"):
    """Collapse a fasta file into a "sequence<TAB>count" table.

    Every record's sequence lines are concatenated into one string, identical
    sequences are counted, and one line per distinct sequence is written to
    ``tabular``, ordered by decreasing count.

    Args:
        fasta: path of the fasta file to read.
        tabular: path of the tab-separated file to write.
        mode: accepted for backward compatibility; not used.

    Notes:
        Counts are accumulated in a local mapping, so the function does not
        depend on a module-level ``seqdic`` being defined by the caller.
        Lines found before the first ``>`` header are ignored, and an empty
        input produces an empty output file.
    """
    counts = defaultdict(int)
    # None until the first header is seen; then the list of sequence lines
    # belonging to the record currently being read.
    chunks = None
    with open(fasta, "r") as handle:
        for line in handle:
            if line.startswith(">"):
                if chunks is not None:
                    # A new header closes the previous record.
                    counts["".join(chunks)] += 1
                chunks = []
            elif chunks is not None:
                # Strip only the line terminator so that a final line
                # without a trailing newline keeps its last base.
                chunks.append(line.rstrip("\r\n"))
    if chunks is not None:
        # The last record has no following header to close it.
        counts["".join(chunks)] += 1
    with open(tabular, "w") as out:
        for seq in sorted(counts, key=counts.get, reverse=True):
            out.write("%s\t%s\n" % (seq, counts[seq]))
0
951cb6b3979b planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
35
951cb6b3979b planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
36
951cb6b3979b planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
def readtabular_writefasta(tabular, fasta):
    """Expand a "sequence<whitespace>count" table into a fasta file.

    Each sequence is written ``count`` times; records are named with a
    running integer starting at 1.

    Args:
        tabular: path of the table to read; the first field of a line is the
            sequence and the second field its integer count.
        fasta: path of the fasta file to write.

    Raises:
        IndexError: if a non-blank line has fewer than two fields.
        ValueError: if the second field is not an integer.
    """
    counter = 0
    with open(tabular, "r") as table, open(fasta, "w") as out:
        for line in table:
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing empty line) carry no record.
                continue
            sequence = fields[0]
            copies = int(fields[1])
            for _ in range(copies):
                counter += 1
                out.write(">%s\n%s\n" % (counter, sequence))
951cb6b3979b planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
48
951cb6b3979b planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
def readtabular_writefastaweighted(tabular, fasta):
    """Convert a "sequence<whitespace>count" table into a weighted fasta file.

    One record is written per table line, with the header
    ``><rank>_<count>`` where ``rank`` is the 1-based position of the line
    among the non-blank lines of the table.

    Args:
        tabular: path of the table to read; the first field of a line is the
            sequence and the second field its count.
        fasta: path of the weighted fasta file to write.

    Raises:
        IndexError: if a non-blank line has fewer than two fields.
    """
    counter = 0
    with open(tabular, "r") as table, open(fasta, "w") as out:
        for line in table:
            # split() already discards the line terminator; slicing off the
            # last character first would truncate the count on a final line
            # that has no trailing newline.
            fields = line.split()
            if not fields:
                continue
            counter += 1
            out.write(">%s_%s\n%s\n" % (counter, fields[1], fields[0]))
951cb6b3979b planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
59
951cb6b3979b planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
def readfastaeighted_writefastaweighted(fastaweigthed_input, fastaweigthed_reparsed):
    """Re-collapse a weighted fasta file, merging identical sequences.

    The weight of a record is the integer after the last ``_`` of its header.
    Weights of identical sequence lines are summed and the result is written
    as a weighted fasta file (headers ``><rank>_<weight>``) ordered by
    decreasing weight. The total of all header weights is reported on
    standard output.

    Args:
        fastaweigthed_input: path of the weighted fasta file to read.
        fastaweigthed_reparsed: path of the weighted fasta file to write.

    Raises:
        ValueError: if a header does not end with an integer weight.

    Notes:
        Weights are accumulated in a local mapping, so the function does not
        depend on a module-level ``seqdic``. Blank lines and sequence lines
        found before the first header are ignored.
    """
    totals = defaultdict(int)
    number_reads = 0
    weight = None  # weight of the record being read; None before any header
    with open(fastaweigthed_input, "r") as handle:
        for line in handle:
            if line.startswith(">"):
                weight = int(line[1:].rstrip("\r\n").split("_")[-1])
                number_reads += weight
            else:
                # Strip only the terminator: a last line without a newline
                # must keep its final base.
                seq = line.rstrip("\r\n")
                if seq and weight is not None:
                    totals[seq] += weight
    with open(fastaweigthed_reparsed, "w") as out:
        ranked = sorted(totals, key=totals.get, reverse=True)
        for rank, seq in enumerate(ranked, 1):
            out.write(">%s_%s\n%s\n" % (rank, totals[seq], seq))
    print("%s reads collapsed" % number_reads)
951cb6b3979b planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
77
951cb6b3979b planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
def readfastaeighted_writefasta(fastaweigthed, fasta):
    """Expand a weighted fasta file into a plain fasta file.

    The weight of a record is the integer after the last ``_`` of its header;
    each following sequence line is written that many times, every copy
    under a header made of a running integer starting at 1.

    Args:
        fastaweigthed: path of the weighted fasta file to read.
        fasta: path of the fasta file to write.

    Raises:
        ValueError: if a header does not end with an integer weight.

    Notes:
        Blank lines and sequence lines found before the first header are
        ignored.
    """
    counter = 0
    weight = 0  # no header seen yet: nothing is emitted for stray lines
    with open(fastaweigthed, "r") as handle, open(fasta, "w") as out:
        for line in handle:
            if line.startswith(">"):
                weight = int(line[1:].rstrip("\r\n").split("_")[-1])
                continue
            # Strip only the terminator so a final line without a newline
            # keeps its last base.
            seq = line.rstrip("\r\n")
            if not seq:
                continue
            for _ in range(weight):
                counter += 1
                out.write(">%s\n%s\n" % (counter, seq))
951cb6b3979b planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
92
1
2f7278120be9 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_fasta_tabular_converter commit 6a93f2809e2939f9d847c3238bfbff8ead367d9f
drosofff
parents: 0
diff changeset
def main(input, output, type):
    """Run the conversion selected by ``type`` from ``input`` to ``output``.

    Args:
        input: path of the file to convert.
        output: path of the converted file to write.
        type: one of "fasta2tabular", "tabular2fasta", "tabular2fastaweight",
            "fastaweight2fastaweight" or "fastaweight2fasta".

    Raises:
        ValueError: if ``type`` is not one of the names above, instead of
            returning without converting anything.

    The parameter names shadow the ``input`` and ``type`` builtins; they are
    kept because they are part of the function's signature.
    """
    if type == "fasta2tabular":
        readfasta_writetabular(input, output)
    elif type == "tabular2fasta":
        readtabular_writefasta(input, output)
    elif type == "tabular2fastaweight":
        readtabular_writefastaweighted(input, output)
    elif type == "fastaweight2fastaweight":
        readfastaeighted_writefastaweighted(input, output)
    elif type == "fastaweight2fasta":
        readfastaeighted_writefasta(input, output)
    else:
        raise ValueError("unknown conversion type: %r" % (type,))
0
951cb6b3979b planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
104
1
2f7278120be9 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_fasta_tabular_converter commit 6a93f2809e2939f9d847c3238bfbff8ead367d9f
drosofff
parents: 0
diff changeset
if __name__ == "__main__":
    # Sequence -> count tally kept at module scope: converter functions in
    # this file may look it up as a global, so it is created before main().
    seqdic = defaultdict(int)
    args = Parser()
    main(input=args.input, output=args.output, type=args.type)