Mercurial > repos > bornea > filter_fasta

--- a/filter_fasta.py	Sat Aug 06 17:15:14 2016 -0400
+++ b/filter_fasta.py	Tue Aug 16 20:36:24 2016 -0400
@@ -29,7 +29,7 @@
     header_start = 0
     prot_start = 0
     for i in data:
-        if "Accession" in i: # finds the start of header
+        if "Accession Number" in i: # finds the start of header
             header_start = cnt
             break
         cnt += 1
@@ -39,8 +39,9 @@
     elif "Accession" in header:
         prot_start = header.index("Accession")
     proteins = []
-    for protein in data[1:]:
-        proteins.append(protein[prot_start])
+    for protein in data[header_start:]:
+        if len(protein) > prot_start:
+            proteins.append(protein[prot_start])
     return proteins
 def FilterFastaSeq(infile,accession): # fasta file and UniprotID/SwissprotID
     input_data = readtab(infile)
@@ -54,7 +55,6 @@
         if flag == 1: # once we have a hit, start adding the sequences
             if ">" not in i[0]: # don't add the headers to the sequence
                 temp.append(i[0])
-                #print temp
         if i[0].startswith(">"): # is it a fasta header?
             if temp != []: # if it is a continued fasta header, add old sequences to the sequence list
             # will this cutoff the last on of the file?
@@ -84,8 +84,10 @@
         x.write(i+'\n'+seq[cnt]+'\n')
         cnt+=1
     x.close()
-fasta = sys.argv[1] # fasta file to filter
-data = sys.argv[2] # scaffold report #2 -- filename
+#fasta = sys.argv[1] # fasta file to filter
+#data = sys.argv[2] # scaffold report #2 -- filename
+fasta = r"C:\Users\Owner\Desktop\APOSTL\SAINT_preprocessing\SwissProt_HUMAN_2014_08.fasta"
+data = r"C:\Users\Owner\Desktop\APOSTL\Scaffold\scaffold_EGFR.txt"

 FilterFastaSeq(fasta,getAccessions(data))
-os.rename("output.txt", sys.argv[3])
\ No newline at end of file
+os.rename("output.txt", "output1.txt")
\ No newline at end of file