# HG changeset patch # User bornea # Date 1471394184 14400 # Node ID c9e6e2e7697c7b910025946b5db2d6199f222e44 # Parent 6794a638a50443765170d91a63b44f0c27dd0cdd Uploaded diff -r 6794a638a504 -r c9e6e2e7697c filter_fasta.py --- a/filter_fasta.py Sat Aug 06 17:15:14 2016 -0400 +++ b/filter_fasta.py Tue Aug 16 20:36:24 2016 -0400 @@ -29,7 +29,7 @@ header_start = 0 prot_start = 0 for i in data: - if "Accession" in i: # finds the start of header + if "Accession Number" in i: # finds the start of header header_start = cnt break cnt += 1 @@ -39,8 +39,9 @@ elif "Accession" in header: prot_start = header.index("Accession") proteins = [] - for protein in data[1:]: - proteins.append(protein[prot_start]) + for protein in data[header_start:]: + if len(protein) > prot_start: + proteins.append(protein[prot_start]) return proteins def FilterFastaSeq(infile,accession): # fasta file and UniprotID/SwissprotID input_data = readtab(infile) @@ -54,7 +55,6 @@ if flag == 1: # once we have a hit, start adding the sequences if ">" not in i[0]: # don't add the headers to the sequence temp.append(i[0]) - #print temp if i[0].startswith(">"): # is it a fasta header? if temp != []: # if it is a continued fasta header, add old sequences to the sequence list # will this cutoff the last on of the file? @@ -84,8 +84,10 @@ x.write(i+'\n'+seq[cnt]+'\n') cnt+=1 x.close() -fasta = sys.argv[1] # fasta file to filter -data = sys.argv[2] # scaffold report #2 -- filename +#fasta = sys.argv[1] # fasta file to filter +#data = sys.argv[2] # scaffold report #2 -- filename +fasta = r"C:\Users\Owner\Desktop\APOSTL\SAINT_preprocessing\SwissProt_HUMAN_2014_08.fasta" +data = r"C:\Users\Owner\Desktop\APOSTL\Scaffold\scaffold_EGFR.txt" FilterFastaSeq(fasta,getAccessions(data)) -os.rename("output.txt", sys.argv[3]) \ No newline at end of file +os.rename("output.txt", "output1.txt") \ No newline at end of file