annotate filter_fasta.py @ 7:e01b49c112b0 draft

Uploaded
author bornea
date Sat, 06 Aug 2016 17:10:23 -0400
parents 5990c4dbfbaa
children afcea0f23a55
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
5
af454e5a9ef5 Uploaded
bornea
parents: 1
diff changeset
1
1
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
2 """
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
3 Python-code: Merge Scaffold Samples Report files
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
4 @author = Brent Kuenzi
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
5 @email = Brent.Kuenzi@moffitt.org
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
6 """
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
7 #######################################################################################
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
8 ## Description: ##
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
9 # This program will filter a fasta file using a data file
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
10 # containing an "Accession" or "Accession Number" column
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
11 ## Required input: ##
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
12 # 1) fasta file to be filtered
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
13 # 2) data file containing a "Accession" or "Accession Number" column
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
14
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
15
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
16 import sys
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
17 import os
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
18 def readtab(infile): # read in tab-delim text
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
19 with open(infile,'r') as x:
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
20 output = []
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
21 for line in x:
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
22 line = line.strip()
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
23 temp = line.split('\t')
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
24 output.append(temp)
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
25 return output
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
26 def getAccessions(infile): # get list of protein accessions from your data
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
27 data = readtab(infile)
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
28 cnt = 0
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
29 header_start = 0
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
30 prot_start = 0
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
31 for i in data:
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
32 if "Accession" in i: # finds the start of header
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
33 header_start = cnt
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
34 break
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
35 cnt += 1
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
36 header = data[header_start]
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
37 if "Accession Number" in header:
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
38 prot_start = header.index("Accession Number")
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
39 elif "Accession" in header:
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
40 prot_start = header.index("Accession")
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
41 proteins = []
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
42 for protein in data[1:]:
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
43 proteins.append(protein[prot_start])
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
44 return proteins
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
45 def FilterFastaSeq(infile,accession): # fasta file and UniprotID/SwissprotID
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
46 input_data = readtab(infile)
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
47 seq=[]
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
48 header=[]
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
49 temp=[]
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
50 flag = 0
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
51 cnt = 0
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
52 for i in input_data:
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
53 cnt+=1
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
54 if flag == 1: # once we have a hit, start adding the sequences
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
55 if ">" not in i[0]: # don't add the headers to the sequence
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
56 temp.append(i[0])
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
57 #print temp
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
58 if i[0].startswith(">"): # is it a fasta header?
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
59 if temp != []: # if it is a continued fasta header, add old sequences to the sequence list
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
60 # will this cutoff the last on of the file?
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
61 merged = "\n".join(temp)
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
62 if merged!="":
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
63 seq.append(merged)
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
64 temp=[]
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
65 lst = i[0].split('|')
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
66 ID1 = lst[1]
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
67 lst2 = lst[2].split(' ')
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
68 ID2 = lst2[0]
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
69 flag = 0
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
70 if ID1 in accession: # are the IDs part of your data?
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
71 flag+=1
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
72 header.append(i[0]) # if it is then append it
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
73 if ID2 in accession: # are the IDs part of your data?
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
74 flag+=1
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
75 header.append(i[0]) # if it is then append it
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
76 if cnt == len(input_data): # account for last fasta sequence in file
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
77 if temp != []:
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
78 merged = "\n".join(temp)
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
79 if merged!="":
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
80 seq.append(merged)
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
81 cnt=0
7
e01b49c112b0 Uploaded
bornea
parents: 6
diff changeset
82 x = open("output.fasta","w"):
e01b49c112b0 Uploaded
bornea
parents: 6
diff changeset
83 for i in header:
e01b49c112b0 Uploaded
bornea
parents: 6
diff changeset
84 x.write(i+'\n'+seq[cnt]+'\n')
e01b49c112b0 Uploaded
bornea
parents: 6
diff changeset
85 cnt+=1
e01b49c112b0 Uploaded
bornea
parents: 6
diff changeset
86 x.close()
1
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
87 fasta = sys.argv[1] # fasta file to filter
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
88 data = sys.argv[2] # scaffold report #2 -- filename
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
89
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
90 FilterFastaSeq(fasta,getAccessions(data))
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
91 os.rename("output.fasta", sys.argv[3])