Mercurial > repos > bornea > filter_fasta
comparison filter_fasta.py @ 10:c9e6e2e7697c draft
Uploaded
author | bornea |
---|---|
date | Tue, 16 Aug 2016 20:36:24 -0400 |
parents | 6794a638a504 |
children | 573c36ff075f |
comparison
equal
deleted
inserted
replaced
9:6794a638a504 | 10:c9e6e2e7697c |
---|---|
27 data = readtab(infile) | 27 data = readtab(infile) |
28 cnt = 0 | 28 cnt = 0 |
29 header_start = 0 | 29 header_start = 0 |
30 prot_start = 0 | 30 prot_start = 0 |
31 for i in data: | 31 for i in data: |
32 if "Accession" in i: # finds the start of header | 32 if "Accession Number" in i: # finds the start of header |
33 header_start = cnt | 33 header_start = cnt |
34 break | 34 break |
35 cnt += 1 | 35 cnt += 1 |
36 header = data[header_start] | 36 header = data[header_start] |
37 if "Accession Number" in header: | 37 if "Accession Number" in header: |
38 prot_start = header.index("Accession Number") | 38 prot_start = header.index("Accession Number") |
39 elif "Accession" in header: | 39 elif "Accession" in header: |
40 prot_start = header.index("Accession") | 40 prot_start = header.index("Accession") |
41 proteins = [] | 41 proteins = [] |
42 for protein in data[1:]: | 42 for protein in data[header_start:]: |
43 proteins.append(protein[prot_start]) | 43 if len(protein) > prot_start: |
44 proteins.append(protein[prot_start]) | |
44 return proteins | 45 return proteins |
45 def FilterFastaSeq(infile,accession): # fasta file and UniprotID/SwissprotID | 46 def FilterFastaSeq(infile,accession): # fasta file and UniprotID/SwissprotID |
46 input_data = readtab(infile) | 47 input_data = readtab(infile) |
47 seq=[] | 48 seq=[] |
48 header=[] | 49 header=[] |
52 for i in input_data: | 53 for i in input_data: |
53 cnt+=1 | 54 cnt+=1 |
54 if flag == 1: # once we have a hit, start adding the sequences | 55 if flag == 1: # once we have a hit, start adding the sequences |
55 if ">" not in i[0]: # don't add the headers to the sequence | 56 if ">" not in i[0]: # don't add the headers to the sequence |
56 temp.append(i[0]) | 57 temp.append(i[0]) |
57 #print temp | |
58 if i[0].startswith(">"): # is it a fasta header? | 58 if i[0].startswith(">"): # is it a fasta header? |
59 if temp != []: # if it is a continued fasta header, add old sequences to the sequence list | 59 if temp != []: # if it is a continued fasta header, add old sequences to the sequence list |
60 # will this cutoff the last on of the file? | 60 # will this cutoff the last on of the file? |
61 merged = "\n".join(temp) | 61 merged = "\n".join(temp) |
62 if merged!="": | 62 if merged!="": |
82 x = open("output.txt","w") | 82 x = open("output.txt","w") |
83 for i in header: | 83 for i in header: |
84 x.write(i+'\n'+seq[cnt]+'\n') | 84 x.write(i+'\n'+seq[cnt]+'\n') |
85 cnt+=1 | 85 cnt+=1 |
86 x.close() | 86 x.close() |
87 fasta = sys.argv[1] # fasta file to filter | 87 #fasta = sys.argv[1] # fasta file to filter |
88 data = sys.argv[2] # scaffold report #2 -- filename | 88 #data = sys.argv[2] # scaffold report #2 -- filename |
89 fasta = r"C:\Users\Owner\Desktop\APOSTL\SAINT_preprocessing\SwissProt_HUMAN_2014_08.fasta" | |
90 data = r"C:\Users\Owner\Desktop\APOSTL\Scaffold\scaffold_EGFR.txt" | |
89 | 91 |
90 FilterFastaSeq(fasta,getAccessions(data)) | 92 FilterFastaSeq(fasta,getAccessions(data)) |
91 os.rename("output.txt", sys.argv[3]) | 93 os.rename("output.txt", "output1.txt") |