comparison filter_fasta.py @ 10:c9e6e2e7697c draft

Uploaded
author bornea
date Tue, 16 Aug 2016 20:36:24 -0400
parents 6794a638a504
children 573c36ff075f
comparison
equal deleted inserted replaced
9:6794a638a504 10:c9e6e2e7697c
27 data = readtab(infile) 27 data = readtab(infile)
28 cnt = 0 28 cnt = 0
29 header_start = 0 29 header_start = 0
30 prot_start = 0 30 prot_start = 0
31 for i in data: 31 for i in data:
32 if "Accession" in i: # finds the start of header 32 if "Accession Number" in i: # finds the start of header
33 header_start = cnt 33 header_start = cnt
34 break 34 break
35 cnt += 1 35 cnt += 1
36 header = data[header_start] 36 header = data[header_start]
37 if "Accession Number" in header: 37 if "Accession Number" in header:
38 prot_start = header.index("Accession Number") 38 prot_start = header.index("Accession Number")
39 elif "Accession" in header: 39 elif "Accession" in header:
40 prot_start = header.index("Accession") 40 prot_start = header.index("Accession")
41 proteins = [] 41 proteins = []
42 for protein in data[1:]: 42 for protein in data[header_start:]:
43 proteins.append(protein[prot_start]) 43 if len(protein) > prot_start:
44 proteins.append(protein[prot_start])
44 return proteins 45 return proteins
45 def FilterFastaSeq(infile,accession): # fasta file and UniprotID/SwissprotID 46 def FilterFastaSeq(infile,accession): # fasta file and UniprotID/SwissprotID
46 input_data = readtab(infile) 47 input_data = readtab(infile)
47 seq=[] 48 seq=[]
48 header=[] 49 header=[]
52 for i in input_data: 53 for i in input_data:
53 cnt+=1 54 cnt+=1
54 if flag == 1: # once we have a hit, start adding the sequences 55 if flag == 1: # once we have a hit, start adding the sequences
55 if ">" not in i[0]: # don't add the headers to the sequence 56 if ">" not in i[0]: # don't add the headers to the sequence
56 temp.append(i[0]) 57 temp.append(i[0])
57 #print temp
58 if i[0].startswith(">"): # is it a fasta header? 58 if i[0].startswith(">"): # is it a fasta header?
59 if temp != []: # if it is a continued fasta header, add old sequences to the sequence list 59 if temp != []: # if it is a continued fasta header, add old sequences to the sequence list
60 # will this cutoff the last on of the file? 60 # will this cutoff the last on of the file?
61 merged = "\n".join(temp) 61 merged = "\n".join(temp)
62 if merged!="": 62 if merged!="":
82 x = open("output.txt","w") 82 x = open("output.txt","w")
83 for i in header: 83 for i in header:
84 x.write(i+'\n'+seq[cnt]+'\n') 84 x.write(i+'\n'+seq[cnt]+'\n')
85 cnt+=1 85 cnt+=1
86 x.close() 86 x.close()
87 fasta = sys.argv[1] # fasta file to filter 87 #fasta = sys.argv[1] # fasta file to filter
88 data = sys.argv[2] # scaffold report #2 -- filename 88 #data = sys.argv[2] # scaffold report #2 -- filename
89 fasta = r"C:\Users\Owner\Desktop\APOSTL\SAINT_preprocessing\SwissProt_HUMAN_2014_08.fasta"
90 data = r"C:\Users\Owner\Desktop\APOSTL\Scaffold\scaffold_EGFR.txt"
89 91
90 FilterFastaSeq(fasta,getAccessions(data)) 92 FilterFastaSeq(fasta,getAccessions(data))
91 os.rename("output.txt", sys.argv[3]) 93 os.rename("output.txt", "output1.txt")