# HG changeset patch # User bornea # Date 1470326929 14400 # Node ID b25b0f210e10e9120c93046417747717f57f412c # Parent c8b26d6d385b1245260bbb913334909c6402311c Uploaded diff -r c8b26d6d385b -r b25b0f210e10 filter_fasta.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter_fasta.py Thu Aug 04 12:08:49 2016 -0400 @@ -0,0 +1,90 @@ +# -*- coding: utf-8 -*- +""" +Python-code: Merge Scaffold Samples Report files +@author = Brent Kuenzi +@email = Brent.Kuenzi@moffitt.org +""" +####################################################################################### +## Description: ## +# This program will filter a fasta file using a data file +# containing an "Accession" or "Accession Number" column +## Required input: ## +# 1) fasta file to be filtered +# 2) data file containing a "Accession" or "Accession Number" column + + +import sys +import os +def readtab(infile): # read in tab-delim text + with open(infile,'r') as x: + output = [] + for line in x: + line = line.strip() + temp = line.split('\t') + output.append(temp) + return output +def getAccessions(infile): # get list of protein accessions from your data + data = readtab(infile) + cnt = 0 + header_start = 0 + prot_start = 0 + for i in data: + if "Accession" in i: # finds the start of header + header_start = cnt + break + cnt += 1 + header = data[header_start] + if "Accession Number" in header: + prot_start = header.index("Accession Number") + elif "Accession" in header: + prot_start = header.index("Accession") + proteins = [] + for protein in data[1:]: + proteins.append(protein[prot_start]) + return proteins +def FilterFastaSeq(infile,accession): # fasta file and UniprotID/SwissprotID + input_data = readtab(infile) + seq=[] + header=[] + temp=[] + flag = 0 + cnt = 0 + for i in input_data: + cnt+=1 + if flag == 1: # once we have a hit, start adding the sequences + if ">" not in i[0]: # don't add the headers to the sequence + temp.append(i[0]) + #print temp + if i[0].startswith(">"): # is it a fasta header? + if temp != []: # if it is a continued fasta header, add old sequences to the sequence list + # will this cutoff the last on of the file? + merged = "\n".join(temp) + if merged!="": + seq.append(merged) + temp=[] + lst = i[0].split('|') + ID1 = lst[1] + lst2 = lst[2].split(' ') + ID2 = lst2[0] + flag = 0 + if ID1 in accession: # are the IDs part of your data? + flag+=1 + header.append(i[0]) # if it is then append it + if ID2 in accession: # are the IDs part of your data? + flag+=1 + header.append(i[0]) # if it is then append it + if cnt == len(input_data): # account for last fasta sequence in file + if temp != []: + merged = "\n".join(temp) + if merged!="": + seq.append(merged) + cnt=0 + with open("output.fasta","wt") as x: + for i in header: + x.write(i+'\n'+seq[cnt]+'\n') + cnt+=1 +fasta = sys.argv[1] # fasta file to filter +data = sys.argv[2] # scaffold report #2 -- filename + +FilterFastaSeq(fasta,getAccessions(data)) +os.rename("output.fasta", sys.argv[3]) \ No newline at end of file