comparison filter_fasta.py @ 1:b25b0f210e10 draft

Uploaded
author bornea
date Thu, 04 Aug 2016 12:08:49 -0400
parents
children af454e5a9ef5
comparison
equal deleted inserted replaced
0:c8b26d6d385b 1:b25b0f210e10
1 # -*- coding: utf-8 -*-
2 """
3 Python-code: Merge Scaffold Samples Report files
4 @author = Brent Kuenzi
5 @email = Brent.Kuenzi@moffitt.org
6 """
7 #######################################################################################
8 ## Description: ##
9 # This program will filter a fasta file using a data file
10 # containing an "Accession" or "Accession Number" column
11 ## Required input: ##
12 # 1) fasta file to be filtered
13 # 2) data file containing a "Accession" or "Accession Number" column
14
15
16 import sys
17 import os
18 def readtab(infile): # read in tab-delim text
19 with open(infile,'r') as x:
20 output = []
21 for line in x:
22 line = line.strip()
23 temp = line.split('\t')
24 output.append(temp)
25 return output
26 def getAccessions(infile): # get list of protein accessions from your data
27 data = readtab(infile)
28 cnt = 0
29 header_start = 0
30 prot_start = 0
31 for i in data:
32 if "Accession" in i: # finds the start of header
33 header_start = cnt
34 break
35 cnt += 1
36 header = data[header_start]
37 if "Accession Number" in header:
38 prot_start = header.index("Accession Number")
39 elif "Accession" in header:
40 prot_start = header.index("Accession")
41 proteins = []
42 for protein in data[1:]:
43 proteins.append(protein[prot_start])
44 return proteins
45 def FilterFastaSeq(infile,accession): # fasta file and UniprotID/SwissprotID
46 input_data = readtab(infile)
47 seq=[]
48 header=[]
49 temp=[]
50 flag = 0
51 cnt = 0
52 for i in input_data:
53 cnt+=1
54 if flag == 1: # once we have a hit, start adding the sequences
55 if ">" not in i[0]: # don't add the headers to the sequence
56 temp.append(i[0])
57 #print temp
58 if i[0].startswith(">"): # is it a fasta header?
59 if temp != []: # if it is a continued fasta header, add old sequences to the sequence list
60 # will this cutoff the last on of the file?
61 merged = "\n".join(temp)
62 if merged!="":
63 seq.append(merged)
64 temp=[]
65 lst = i[0].split('|')
66 ID1 = lst[1]
67 lst2 = lst[2].split(' ')
68 ID2 = lst2[0]
69 flag = 0
70 if ID1 in accession: # are the IDs part of your data?
71 flag+=1
72 header.append(i[0]) # if it is then append it
73 if ID2 in accession: # are the IDs part of your data?
74 flag+=1
75 header.append(i[0]) # if it is then append it
76 if cnt == len(input_data): # account for last fasta sequence in file
77 if temp != []:
78 merged = "\n".join(temp)
79 if merged!="":
80 seq.append(merged)
81 cnt=0
82 with open("output.fasta","wt") as x:
83 for i in header:
84 x.write(i+'\n'+seq[cnt]+'\n')
85 cnt+=1
86 fasta = sys.argv[1] # fasta file to filter
87 data = sys.argv[2] # scaffold report #2 -- filename
88
89 FilterFastaSeq(fasta,getAccessions(data))
90 os.rename("output.fasta", sys.argv[3])