Mercurial > repos > bornea > filter_fasta
comparison filter_fasta.py @ 1:b25b0f210e10 draft
Uploaded
author | bornea |
---|---|
date | Thu, 04 Aug 2016 12:08:49 -0400 |
parents | |
children | af454e5a9ef5 |
comparison
equal
deleted
inserted
replaced
0:c8b26d6d385b | 1:b25b0f210e10 |
---|---|
1 # -*- coding: utf-8 -*- | |
2 """ | |
3 Python-code: Merge Scaffold Samples Report files | |
4 @author = Brent Kuenzi | |
5 @email = Brent.Kuenzi@moffitt.org | |
6 """ | |
7 ####################################################################################### | |
8 ## Description: ## | |
9 # This program will filter a fasta file using a data file | |
10 # containing an "Accession" or "Accession Number" column | |
11 ## Required input: ## | |
12 # 1) fasta file to be filtered | |
13 # 2) data file containing a "Accession" or "Accession Number" column | |
14 | |
15 | |
16 import sys | |
17 import os | |
18 def readtab(infile): # read in tab-delim text | |
19 with open(infile,'r') as x: | |
20 output = [] | |
21 for line in x: | |
22 line = line.strip() | |
23 temp = line.split('\t') | |
24 output.append(temp) | |
25 return output | |
26 def getAccessions(infile): # get list of protein accessions from your data | |
27 data = readtab(infile) | |
28 cnt = 0 | |
29 header_start = 0 | |
30 prot_start = 0 | |
31 for i in data: | |
32 if "Accession" in i: # finds the start of header | |
33 header_start = cnt | |
34 break | |
35 cnt += 1 | |
36 header = data[header_start] | |
37 if "Accession Number" in header: | |
38 prot_start = header.index("Accession Number") | |
39 elif "Accession" in header: | |
40 prot_start = header.index("Accession") | |
41 proteins = [] | |
42 for protein in data[1:]: | |
43 proteins.append(protein[prot_start]) | |
44 return proteins | |
45 def FilterFastaSeq(infile,accession): # fasta file and UniprotID/SwissprotID | |
46 input_data = readtab(infile) | |
47 seq=[] | |
48 header=[] | |
49 temp=[] | |
50 flag = 0 | |
51 cnt = 0 | |
52 for i in input_data: | |
53 cnt+=1 | |
54 if flag == 1: # once we have a hit, start adding the sequences | |
55 if ">" not in i[0]: # don't add the headers to the sequence | |
56 temp.append(i[0]) | |
57 #print temp | |
58 if i[0].startswith(">"): # is it a fasta header? | |
59 if temp != []: # if it is a continued fasta header, add old sequences to the sequence list | |
60 # will this cutoff the last on of the file? | |
61 merged = "\n".join(temp) | |
62 if merged!="": | |
63 seq.append(merged) | |
64 temp=[] | |
65 lst = i[0].split('|') | |
66 ID1 = lst[1] | |
67 lst2 = lst[2].split(' ') | |
68 ID2 = lst2[0] | |
69 flag = 0 | |
70 if ID1 in accession: # are the IDs part of your data? | |
71 flag+=1 | |
72 header.append(i[0]) # if it is then append it | |
73 if ID2 in accession: # are the IDs part of your data? | |
74 flag+=1 | |
75 header.append(i[0]) # if it is then append it | |
76 if cnt == len(input_data): # account for last fasta sequence in file | |
77 if temp != []: | |
78 merged = "\n".join(temp) | |
79 if merged!="": | |
80 seq.append(merged) | |
81 cnt=0 | |
82 with open("output.fasta","wt") as x: | |
83 for i in header: | |
84 x.write(i+'\n'+seq[cnt]+'\n') | |
85 cnt+=1 | |
86 fasta = sys.argv[1] # fasta file to filter | |
87 data = sys.argv[2] # scaffold report #2 -- filename | |
88 | |
89 FilterFastaSeq(fasta,getAccessions(data)) | |
90 os.rename("output.fasta", sys.argv[3]) |