Mercurial > repos > bornea > filter_fasta
view filter_fasta.py @ 10:c9e6e2e7697c draft
Uploaded
author | bornea |
---|---|
date | Tue, 16 Aug 2016 20:36:24 -0400 |
parents | 6794a638a504 |
children | 573c36ff075f |
line wrap: on
line source
""" Python-code: Merge Scaffold Samples Report files @author = Brent Kuenzi @email = Brent.Kuenzi@moffitt.org """ ####################################################################################### ## Description: ## # This program will filter a fasta file using a data file # containing an "Accession" or "Accession Number" column ## Required input: ## # 1) fasta file to be filtered # 2) data file containing a "Accession" or "Accession Number" column import sys import os def readtab(infile): # read in tab-delim text with open(infile,'r') as x: output = [] for line in x: line = line.strip() temp = line.split('\t') output.append(temp) return output def getAccessions(infile): # get list of protein accessions from your data data = readtab(infile) cnt = 0 header_start = 0 prot_start = 0 for i in data: if "Accession Number" in i: # finds the start of header header_start = cnt break cnt += 1 header = data[header_start] if "Accession Number" in header: prot_start = header.index("Accession Number") elif "Accession" in header: prot_start = header.index("Accession") proteins = [] for protein in data[header_start:]: if len(protein) > prot_start: proteins.append(protein[prot_start]) return proteins def FilterFastaSeq(infile,accession): # fasta file and UniprotID/SwissprotID input_data = readtab(infile) seq=[] header=[] temp=[] flag = 0 cnt = 0 for i in input_data: cnt+=1 if flag == 1: # once we have a hit, start adding the sequences if ">" not in i[0]: # don't add the headers to the sequence temp.append(i[0]) if i[0].startswith(">"): # is it a fasta header? if temp != []: # if it is a continued fasta header, add old sequences to the sequence list # will this cutoff the last on of the file? merged = "\n".join(temp) if merged!="": seq.append(merged) temp=[] lst = i[0].split('|') ID1 = lst[1] lst2 = lst[2].split(' ') ID2 = lst2[0] flag = 0 if ID1 in accession: # are the IDs part of your data? flag+=1 header.append(i[0]) # if it is then append it if ID2 in accession: # are the IDs part of your data? flag+=1 header.append(i[0]) # if it is then append it if cnt == len(input_data): # account for last fasta sequence in file if temp != []: merged = "\n".join(temp) if merged!="": seq.append(merged) cnt=0 x = open("output.txt","w") for i in header: x.write(i+'\n'+seq[cnt]+'\n') cnt+=1 x.close() #fasta = sys.argv[1] # fasta file to filter #data = sys.argv[2] # scaffold report #2 -- filename fasta = r"C:\Users\Owner\Desktop\APOSTL\SAINT_preprocessing\SwissProt_HUMAN_2014_08.fasta" data = r"C:\Users\Owner\Desktop\APOSTL\Scaffold\scaffold_EGFR.txt" FilterFastaSeq(fasta,getAccessions(data)) os.rename("output.txt", "output1.txt")