Mercurial > repos > bornea > filter_fasta
changeset 10:c9e6e2e7697c draft
Uploaded
author | bornea |
---|---|
date | Tue, 16 Aug 2016 20:36:24 -0400 |
parents | 6794a638a504 |
children | 573c36ff075f |
files | filter_fasta.py |
diffstat | 1 files changed, 9 insertions(+), 7 deletions(-) [+] |
line wrap: on
line diff
--- a/filter_fasta.py Sat Aug 06 17:15:14 2016 -0400
+++ b/filter_fasta.py Tue Aug 16 20:36:24 2016 -0400
@@ -29,7 +29,7 @@
     header_start = 0
     prot_start = 0
     for i in data:
-        if "Accession" in i: # finds the start of header
+        if "Accession Number" in i: # finds the start of header
             header_start = cnt
             break
         cnt += 1
@@ -39,8 +39,9 @@
     elif "Accession" in header:
         prot_start = header.index("Accession")
     proteins = []
-    for protein in data[1:]:
-        proteins.append(protein[prot_start])
+    for protein in data[header_start:]:
+        if len(protein) > prot_start:
+            proteins.append(protein[prot_start])
     return proteins
 def FilterFastaSeq(infile,accession): # fasta file and UniprotID/SwissprotID
     input_data = readtab(infile)
@@ -54,7 +55,6 @@
         if flag == 1: # once we have a hit, start adding the sequences
             if ">" not in i[0]: # don't add the headers to the sequence
                 temp.append(i[0])
-                #print temp
         if i[0].startswith(">"): # is it a fasta header?
             if temp != []: # if it is a continued fasta header, add old sequences to the sequence list
                 # will this cutoff the last on of the file?
@@ -84,8 +84,10 @@
         x.write(i+'\n'+seq[cnt]+'\n')
         cnt+=1
     x.close()
-fasta = sys.argv[1] # fasta file to filter
-data = sys.argv[2] # scaffold report #2 -- filename
+#fasta = sys.argv[1] # fasta file to filter
+#data = sys.argv[2] # scaffold report #2 -- filename
+fasta = r"C:\Users\Owner\Desktop\APOSTL\SAINT_preprocessing\SwissProt_HUMAN_2014_08.fasta"
+data = r"C:\Users\Owner\Desktop\APOSTL\Scaffold\scaffold_EGFR.txt"
 FilterFastaSeq(fasta,getAccessions(data))
-os.rename("output.txt", sys.argv[3])
\ No newline at end of file
+os.rename("output.txt", "output1.txt")
\ No newline at end of file