Mercurial > repos > galaxyp > filter_by_fasta_ids
changeset 1:8d15aebf55fd draft
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
author | galaxyp |
---|---|
date | Tue, 24 May 2016 13:05:22 -0400 |
parents | 463ebeccb854 |
children | 1bd985f14938 |
files | README.md filter_by_fasta_ids.py filter_by_fasta_ids.xml test-data/.gitkeep test-data/ids.txt test-data/input.fasta test-data/output_dedup.fasta test-data/output_not_dedup.fasta tool-data/.gitkeep tools/filter_by_fasta_ids.py tools/filter_by_fasta_ids.xml |
diffstat | 9 files changed, 177 insertions(+), 130 deletions(-) [+] |
line wrap: on
line diff
--- a/README.md Fri Sep 26 14:23:16 2014 -0400 +++ b/README.md Tue May 24 13:05:22 2016 -0400 @@ -1,7 +1,7 @@ GalaxyP - Filter by FASTA IDs ============================= -* Home: <https://bitbucket.org/galaxyp/filter_by_fasta_ids> +* Home: <https://github.com/galaxyproteomics/tools-galaxyp/> * Galaxy Tool Shed: <http://toolshed.g2.bx.psu.edu/view/galaxyp/filter_by_fasta_ids> * Tool ID: `filter_by_fasta_ids` @@ -15,9 +15,9 @@ GalaxyP Community ----------------- -Current governing community policies for [GalaxyP](https://bitbucket.org/galaxyp/) and other information can be found at: +Current governing community policies for [GalaxyP](https://github.com/galaxyproteomics/) and other information can be found at: -<https://bitbucket.org/galaxyp/galaxyp> +<https://github.com/galaxyproteomics> License @@ -35,7 +35,7 @@ Contributing ------------ -Contributions to this repository are reviewed through pull requests. If you would like your work acknowledged, please also add yourself to the Authors section. If your pull request is accepted, you will also be acknowledged in <https://bitbucket.org/galaxyp/galaxyp/CONTRIBUTORS.md> unless you opt-out. +Contributions to this repository are reviewed through pull requests. If you would like your work acknowledged, please also add yourself to the Authors section. If your pull request is accepted, you will also be acknowledged in <https://github.com/galaxyproteomics/tools-galaxyp/> Authors
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter_by_fasta_ids.py Tue May 24 13:05:22 2016 -0400 @@ -0,0 +1,100 @@ +#!/usr/bin/env python +""" A script to build specific fasta databases """ +from __future__ import print_function +import optparse + + +# ===================================== Iterator =============================== +class Sequence: + ''' Holds protein sequence information ''' + def __init__(self): + self.header = "" + self.sequence_parts = [] + + def get_sequence(self): + return "".join([line.rstrip().replace('\n', '').replace('\r', '') for line in self.sequence_parts]) + + +class FASTAReader: + """ + FASTA db iterator. Returns a single FASTA sequence object. + """ + def __init__(self, fasta_name): + self.fasta_file = open(fasta_name) + self.next_line = self.fasta_file.readline() + + def __iter__(self): + return self + + def __next__(self): + ''' Iteration ''' + next_line = self.next_line + if not next_line: + raise StopIteration + + seq = Sequence() + seq.header = next_line.rstrip().replace('\n', '').replace('\r', '') + + next_line = self.fasta_file.readline() + while next_line and next_line[0] != '>': + seq.sequence_parts.append(next_line) + next_line = self.fasta_file.readline() + self.next_line = next_line + return seq + + # Python 2/3 compat + next = __next__ + + +def target_match(target, search_entry): + ''' Matches ''' + search_entry = search_entry.upper() + for atarget in target: + if search_entry.find(atarget) > -1: + return atarget + return None + + +def main(): + ''' the main function''' + + parser = optparse.OptionParser() + parser.add_option('--dedup', dest='dedup', action='store_true', default=False, help='Whether to remove duplicate sequences') + (options, args) = parser.parse_args() + + targets = [] + + with open(args[0]) as f_target: + for line in f_target.readlines(): + targets.append(">%s" % line.strip().upper()) + + print('Read target file, now looking for %d sequences.' % len(targets)) + + work_summary = {'wanted': len(targets), 'found': 0} + if options.dedup: + used_sequences = set() + work_summary['duplicates'] = 0 + homd_db = FASTAReader(args[1]) + + with open(args[2], "w") as output: + for entry in homd_db: + target_matched_results = target_match(targets, entry.header) + if target_matched_results: + work_summary['found'] += 1 + targets.remove(target_matched_results) + sequence = entry.get_sequence() + if options.dedup: + if sequence in used_sequences: + work_summary['duplicates'] += 1 + continue + else: + used_sequences.add(sequence) + print(entry.header, file=output) + print(sequence, file=output) + + print('Completed filtering.') + for parm, count in work_summary.items(): + print('%s ==> %d' % (parm, count)) + +if __name__ == "__main__": + main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter_by_fasta_ids.xml Tue May 24 13:05:22 2016 -0400 @@ -0,0 +1,40 @@ +<tool id="filter_by_fasta_ids" version="1.0" name="Filter by FASTA IDs"> + <description>Extract sequences from a FASTA file based on a list of IDs</description> + <command> +<![CDATA[ + python $__tool_directory__/filter_by_fasta_ids.py + $dedup + '$identifiers' + '$input' + '$output' +]]> + </command> + <inputs> + <param format="fasta" name="input" type="data" label="FASTA sequences"/> + <param format="txt" name="identifiers" type="data" label="List of IDs to extract sequences for"/> + <param name="dedup" type="boolean" truevalue="--dedup" falsevalue="" checked="true" label="Remove duplicate sequences" /> + </inputs> + <outputs> + <data format="fasta" name="output" label="FASTA sequences for ${identifiers.name}"/> + </outputs> + <tests> + <test> + <param name="input" ftype="fasta" value="input.fasta" /> + <param name="identifiers" ftype="txt" value="ids.txt" /> + <output name="output" file="output_dedup.fasta" /> + </test> + <test> + <param name="input" ftype="fasta" value="input.fasta" /> + <param name="identifiers" ftype="txt" value="ids.txt" /> + <param name="dedup" value="False" /> + <output name="output" file="output_not_dedup.fasta" /> + </test> + </tests> + <help> +<![CDATA[ +**What it does** + +Extract sequences from a FASTA file based on a list of IDs. +]]> + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/ids.txt Tue May 24 13:05:22 2016 -0400 @@ -0,0 +1,5 @@ +2 +2_bis +3 +4 +6
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/input.fasta Tue May 24 13:05:22 2016 -0400 @@ -0,0 +1,14 @@ +>1 +TGAC +>2 +AAAAAAAA +>3 +ACGT +>2_bis +AAAA +AAAA +>4 +ACGT +TGAC +>5 +TTTT
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/output_dedup.fasta Tue May 24 13:05:22 2016 -0400 @@ -0,0 +1,6 @@ +>2 +AAAAAAAA +>3 +ACGT +>4 +ACGTTGAC
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/output_not_dedup.fasta Tue May 24 13:05:22 2016 -0400 @@ -0,0 +1,8 @@ +>2 +AAAAAAAA +>3 +ACGT +>2_bis +AAAAAAAA +>4 +ACGTTGAC
--- a/tools/filter_by_fasta_ids.py Fri Sep 26 14:23:16 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,113 +0,0 @@ -#!/usr/bin/env python -""" A script to build specific fasta databases """ -from __future__ import print_function -import sys -import logging - -#===================================== Iterator =============================== -class Sequence: - ''' Holds protein sequence information ''' - def __init__(self): - self.header = "" - self.sequence_parts = [] - - def get_sequence(self): - return "".join([line.rstrip().replace('\n','').replace('\r','') for line in self.sequence_parts]) - -class FASTAReader: - """ - FASTA db iterator. Returns a single FASTA sequence object. - """ - def __init__(self, fasta_name): - self.fasta_file = open(fasta_name) - self.next_line = self.fasta_file.readline() - - def __iter__(self): - return self - - def __next__(self): - ''' Iteration ''' - #while True: - # line = self.fasta_file.readline() - # if not line: - # raise StopIteration - # if line[0] == '>': - # break - next_line = self.next_line - if not next_line: - raise StopIteration - - seq = Sequence() - seq.header = next_line.rstrip().replace('\n','').replace('\r','') - - next_line = self.fasta_file.readline() - while next_line and next_line[0] != '>': - #tail = self.fasta_file.tell() - #line = self.fasta_file.readline() - #if not line: - # break - #if line[0] == '>': - # self.fasta_file.seek(tail) - # break - seq.sequence_parts.append(next_line) - next_line = self.fasta_file.readline() - self.next_line = next_line - return seq - - # Python 2/3 compat - next = __next__ -#============================================================================== - -def target_match(target, search_entry): - ''' Matches ''' - search_entry = search_entry.upper() - for atarget in target: - if search_entry.find(atarget) > -1: - return atarget - return None - - -def main(): - ''' the main function''' - logging.basicConfig(filename='filter_fasta_log', - level=logging.INFO, - format='%(asctime)s :: %(levelname)s :: %(message)s',) - - used_sequences = set() - work_summary = {'wanted': 0, 'found':0, 'duplicates':0} - targets = [] - - f_target = open(sys.argv[1]) - for line in f_target.readlines(): - targets.append(">%s" % line.strip().upper()) - f_target.close() - - logging.info('Read target file and am now looking for %d %s', len(targets), 'sequences.') - - work_summary['wanted'] = len(targets) - homd_db = FASTAReader(sys.argv[2]) - - i = 0 - output = open(sys.argv[3], "w") - try: - for entry in homd_db: - target_matched_results = target_match(targets, entry.header) - if target_matched_results: - work_summary['found'] += 1 - targets.remove(target_matched_results) - sequence = entry.get_sequence() - if sequence in used_sequences: - work_summary['duplicates'] += 1 - else: - used_sequences.add(sequence) - print(entry.header, file=output) - print(sequence, file=output) - finally: - output.close() - - logging.info('Completed filtering') - for parm, count in work_summary.items(): - logging.info('%s ==> %d', parm, count) - -if __name__ == "__main__": - main()
--- a/tools/filter_by_fasta_ids.xml Fri Sep 26 14:23:16 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,13 +0,0 @@ -<tool id="filter_by_fasta_ids" version="1.0" name="Filter by FASTA IDs"> - <description>Extract sequences from a FASTA file based on a list of IDs</description> - <command interpreter="python">filter_by_fasta_ids.py $identifiers $input $output</command> - <inputs> - <param format="fasta" name="input" type="data" label="FASTA sequences"/> - <param format="txt" name="identifiers" type="data" label="List of IDs to extract sequences for"/> - </inputs> - <outputs> - <data format="fasta" name="output" label="FASTA sequences for ${identifiers.name}"/> - </outputs> - <help> - </help> -</tool>