Mercurial > repos > drosofff > msp_fasta_tabular_converter
comparison fasta_tabular_converter.py @ 3:36388b666bfc draft default tip
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_fasta_tabular_converter commit b6de14061c479f0418cd89e26d6f5ac26e565a07
| author | drosofff |
|---|---|
| date | Wed, 09 Nov 2016 11:24:13 -0500 |
| parents | 330dd8a8c31a |
| children |
comparison
equal
deleted
inserted
replaced
| 2:330dd8a8c31a | 3:36388b666bfc |
|---|---|
| 1 #!/usr/bin/python | 1 #!/usr/bin/env python |
| 2 # | 2 # |
| 3 import argparse | |
| 4 import logging | |
| 3 import sys | 5 import sys |
| 4 import string | |
| 5 import argparse | |
| 6 from collections import defaultdict | 6 from collections import defaultdict |
| 7 | |
| 7 | 8 |
| 8 def Parser(): | 9 def Parser(): |
| 9 the_parser = argparse.ArgumentParser() | 10 the_parser = argparse.ArgumentParser() |
| 10 the_parser.add_argument( | 11 the_parser.add_argument( |
| 11 '--input', action="store", type=str, help="input file") | 12 '--input', action="store", type=str, help="input file") |
| 14 the_parser.add_argument( | 15 the_parser.add_argument( |
| 15 '--type', action="store", type=str, help="type of convertion") | 16 '--type', action="store", type=str, help="type of convertion") |
| 16 args = the_parser.parse_args() | 17 args = the_parser.parse_args() |
| 17 return args | 18 return args |
| 18 | 19 |
| 20 | |
| 19 def readfasta_writetabular(fasta, tabular, mode="oneline"): | 21 def readfasta_writetabular(fasta, tabular, mode="oneline"): |
| 20 F = open(fasta, "r") | 22 for line in fasta: |
| 21 for line in F: | |
| 22 if line[0] == ">": | 23 if line[0] == ">": |
| 23 try: | 24 try: |
| 24 seqdic["".join(stringlist)] += 1 # to dump the sequence of the previous item - try because of first missing stringlist variable | 25 seqdic["".join(stringlist)] += 1 # to dump the sequence of the previous item - try because of first missing stringlist variable |
| 25 except: pass | 26 except NameError: |
| 26 stringlist=[] | 27 pass |
| 28 stringlist = [] | |
| 27 else: | 29 else: |
| 28 stringlist.append(line[:-1]) | 30 try: |
| 31 stringlist.append(line[:-1]) | |
| 32 except UnboundLocalError: # if file went through filter and contains only empty lines | |
| 33 logging.info("first line is empty.") | |
| 29 try: | 34 try: |
| 30 seqdic["".join(stringlist)] += 1 # for the last sequence | 35 seqdic["".join(stringlist)] += 1 # for the last sequence |
| 31 except: pass # in case file to convert is empty | 36 except NameError: |
| 32 F.close() | 37 logging.info("input file has not fasta sequences.") |
| 33 F = open(tabular, "w") | |
| 34 for seq in sorted(seqdic, key=seqdic.get, reverse=True): | 38 for seq in sorted(seqdic, key=seqdic.get, reverse=True): |
| 35 print >> F, "%s\t%s" % (seq, seqdic[seq]) | 39 tabular.write("%s\t%s\n" % (seq, seqdic[seq])) |
| 36 F.close() | 40 |
| 37 | 41 |
| 38 | |
| 39 def readtabular_writefasta(tabular, fasta): | 42 def readtabular_writefasta(tabular, fasta): |
| 40 F = open(tabular, "r") | 43 counter = 0 |
| 41 Fw = open(fasta, "w") | 44 for line in tabular: |
| 42 counter = 0 | 45 fields = line.split() |
| 43 for line in F: | 46 for i in range(int(fields[1])): |
| 44 fields = line.split() | 47 counter += 1 |
| 45 for i in range(int(fields[1])): | 48 fasta.write(">%s\n%s\n" % (counter, fields[0])) |
| 46 counter += 1 | |
| 47 print >> Fw, ">%s\n%s" % (counter, fields[0]) | |
| 48 F.close() | |
| 49 Fw.close() | |
| 50 | 49 |
| 51 def readtabular_writefastaweighted (tabular, fasta): | |
| 52 F = open(tabular, "r") | |
| 53 Fw = open(fasta, "w") | |
| 54 counter = 0 | |
| 55 for line in F: | |
| 56 counter += 1 | |
| 57 fields = line[:-1].split() | |
| 58 print >> Fw, ">%s_%s\n%s" % (counter, fields[1], fields[0]) | |
| 59 F.close() | |
| 60 Fw.close() | |
| 61 | 50 |
| 62 def readfastaeighted_writefastaweighted(fastaweigthed_input, fastaweigthed_reparsed): | 51 def readtabular_writefastaweighted(tabular, fasta): |
| 63 F = open(fastaweigthed_input, "r") | 52 counter = 0 |
| 64 number_reads = 0 | 53 for line in tabular: |
| 65 for line in F: | 54 counter += 1 |
| 66 if line[0] == ">": | 55 fields = line[:-1].split() |
| 67 weigth = int(line[1:-1].split("_")[-1]) | 56 fasta.write(">%s_%s\n%s\n" % (counter, fields[1], fields[0])) |
| 68 number_reads += weigth | |
| 69 else: | |
| 70 seqdic[line[:-1]] += weigth | |
| 71 F.close() | |
| 72 F = open(fastaweigthed_reparsed, "w") | |
| 73 n=0 | |
| 74 for seq in sorted(seqdic, key=seqdic.get, reverse=True): | |
| 75 n += 1 | |
| 76 print >> F, ">%s_%s\n%s" % (n, seqdic[seq], seq) | |
| 77 F.close() | |
| 78 print "%s reads collapsed" % number_reads | |
| 79 | 57 |
| 80 def readfastaeighted_writefasta(fastaweigthed, fasta): | 58 |
| 81 F = open(fastaweigthed, "r") | 59 def readfastaweighted_writefastaweighted(fastaweigthed_input, fastaweigthed_reparsed): |
| 82 Fw = open(fasta, "w") | 60 number_reads = 0 |
| 83 counter = 0 | 61 for line in fastaweigthed_input: |
| 84 for line in F: | 62 if line[0] == ">": |
| 85 if line[0] == ">": | 63 weigth = int(line[1:-1].split("_")[-1]) |
| 86 weigth = int(line[1:-1].split("_")[-1]) | 64 number_reads += weigth |
| 87 else: | 65 else: |
| 88 seq = line[:-1] | 66 seqdic[line[:-1]] += weigth |
| 89 for i in range (weigth): | 67 n = 0 |
| 90 counter += 1 | 68 for seq in sorted(seqdic, key=seqdic.get, reverse=True): |
| 91 print >> Fw, ">%s\n%s" % (counter, seq) | 69 n += 1 |
| 92 F.close() | 70 fastaweigthed_reparsed.write(">%s_%s\n%s\n" % (n, seqdic[seq], seq)) |
| 93 Fw.close() | 71 log.info("%s reads collapsed" % number_reads) |
| 72 | |
| 73 | |
| 74 def readfastaweighted_writefasta(fastaweigthed, fasta): | |
| 75 counter = 0 | |
| 76 for line in fastaweigthed: | |
| 77 if line[0] == ">": | |
| 78 weigth = int(line[1:-1].split("_")[-1]) | |
| 79 else: | |
| 80 seq = line[:-1] | |
| 81 for i in range(weigth): | |
| 82 counter += 1 | |
| 83 fasta.write(">%s\n%s\n" % (counter, seq)) | |
| 84 | |
| 94 | 85 |
| 95 def main(input, output, type): | 86 def main(input, output, type): |
| 96 if type == "fasta2tabular": | 87 with open(input, "r") as input: |
| 97 readfasta_writetabular(input, output) | 88 with open(output, "w") as output: |
| 98 elif type == "tabular2fasta": | 89 if type == "fasta2tabular": |
| 99 readtabular_writefasta(input, output) | 90 readfasta_writetabular(input, output) |
| 100 elif type == "tabular2fastaweight": | 91 elif type == "tabular2fasta": |
| 101 readtabular_writefastaweighted (input, output) | 92 readtabular_writefasta(input, output) |
| 102 elif type == "fastaweight2fastaweight": | 93 elif type == "tabular2fastaweight": |
| 103 readfastaeighted_writefastaweighted(input, output) | 94 readtabular_writefastaweighted(input, output) |
| 104 elif type == "fastaweight2fasta": | 95 elif type == "fastaweight2fastaweight": |
| 105 readfastaeighted_writefasta(input, output) | 96 readfastaweighted_writefastaweighted(input, output) |
| 97 elif type == "fastaweight2fasta": | |
| 98 readfastaweighted_writefasta(input, output) | |
| 99 | |
| 106 | 100 |
| 107 if __name__ == "__main__": | 101 if __name__ == "__main__": |
| 108 seqdic = defaultdict(int) | 102 seqdic = defaultdict(int) |
| 109 args = Parser() | 103 args = Parser() |
| 110 main (args.input, args.output, args.type) | 104 log = logging.getLogger(__name__) |
| 105 logging.basicConfig(stream=sys.stdout, level=logging.INFO) | |
| 106 main(args.input, args.output, args.type) |
