Mercurial > repos > galaxyp > filter_by_fasta_ids

--- a/README.md	Tue May 24 13:05:22 2016 -0400
+++ b/README.md	Sat Apr 28 03:49:28 2018 -0400
@@ -1,5 +1,5 @@
-GalaxyP - Filter by FASTA IDs
-=============================
+GalaxyP - Filter FASTA
+======================

 * Home: <https://github.com/galaxyproteomics/tools-galaxyp/>
 * Galaxy Tool Shed: <http://toolshed.g2.bx.psu.edu/view/galaxyp/filter_by_fasta_ids>
@@ -9,7 +9,7 @@
 Description
 -----------

-Extract sequences from a FASTA file based on a list of IDs.
+Filter entries of a FASTA file on the headers and/or the sequences based on various criteria.


 GalaxyP Community
@@ -23,7 +23,7 @@
 License
 -------

-Copyright (c) 2014 Regents of the University of Minnesota and Authors listed below.
+Copyright (c) 2014,2018 Regents of the University of Minnesota and Authors listed below.

 To the extent possible under law, the author(s) have dedicated all copyright and related and neighboring rights to this software to the public domain worldwide. This software is distributed without any warranty.

@@ -44,4 +44,6 @@
 Authors and contributors:

 * John Chilton <jmchilton@gmail.com>
-* Minnesota Supercomputing Institute, Univeristy of Minnesota
+* Minnesota Supercomputing Institute, University of Minnesota
+* Nicola Soranzo <nicola.soranzo@earlham.ac.uk>
+* Earlham Institute, Norwich, UK
--- a/filter_by_fasta_ids.py	Tue May 24 13:05:22 2016 -0400
+++ b/filter_by_fasta_ids.py	Sat Apr 28 03:49:28 2018 -0400
@@ -1,100 +1,135 @@
 #!/usr/bin/env python
 """ A script to build specific fasta databases """
 from __future__ import print_function
-import optparse
+
+import argparse
+import re
+import sys


-# ===================================== Iterator ===============================
-class Sequence:
-    ''' Holds protein sequence information '''
-    def __init__(self):
-        self.header = ""
-        self.sequence_parts = []
+class Sequence(object):
+    def __init__(self, header, sequence_parts):
+        self.header = header
+        self.sequence_parts = sequence_parts
+        self._sequence = None

-    def get_sequence(self):
-        return "".join([line.rstrip().replace('\n', '').replace('\r', '') for line in self.sequence_parts])
+    @property
+    def sequence(self):
+        if self._sequence is None:
+            self._sequence = ''.join(self.sequence_parts)
+        return self._sequence
+
+    def print(self, fh=sys.stdout):
+        print(self.header, file=fh)
+        for line in self.sequence_parts:
+            print(line, file=fh)


-class FASTAReader:
-    """
-        FASTA db iterator. Returns a single FASTA sequence object.
-    """
-    def __init__(self, fasta_name):
-        self.fasta_file = open(fasta_name)
-        self.next_line = self.fasta_file.readline()
-
-    def __iter__(self):
-        return self
-
-    def __next__(self):
-        ''' Iteration '''
-        next_line = self.next_line
-        if not next_line:
-            raise StopIteration
-
-        seq = Sequence()
-        seq.header = next_line.rstrip().replace('\n', '').replace('\r', '')
-
-        next_line = self.fasta_file.readline()
-        while next_line and next_line[0] != '>':
-            seq.sequence_parts.append(next_line)
-            next_line = self.fasta_file.readline()
-        self.next_line = next_line
-        return seq
-
-    # Python 2/3 compat
-    next = __next__
+def FASTAReader_gen(fasta_filename):
+    """Yield one Sequence (header line, list of right-stripped sequence lines) per FASTA entry; raise ValueError on a bad header."""
+    with open(fasta_filename) as fasta_file:
+        line = fasta_file.readline()
+        while line:  # readline() returns an empty string only at end of file
+            if not line.startswith('>'):  # explicit raise: an assert would be stripped under python -O
+                raise ValueError("FASTA headers must start with >")
+            header = line.rstrip()
+            sequence_parts = []
+            line = fasta_file.readline()
+            while line and line[0] != '>':
+                sequence_parts.append(line.rstrip())
+                line = fasta_file.readline()
+            yield Sequence(header, sequence_parts)


-def target_match(target, search_entry):
+def target_match(targets, header):
     ''' Matches '''
-    search_entry = search_entry.upper()
-    for atarget in target:
-        if search_entry.find(atarget) > -1:
-            return atarget
+    # Remove '>' and initial spaces from the header, compare case-insensitively
+    header = header[1:].lstrip().upper()
+    # Search for an exact match of the whole header among the targets
+    if header in targets:
+        return header
+    # Fall back to the first "word" of the header; an empty header (bare '>') has no word to split off
+    header = header.split()[0] if header else header
+    if header in targets:
+        return header
     return None


 def main():
     ''' the main function'''

-    parser = optparse.OptionParser()
-    parser.add_option('--dedup', dest='dedup', action='store_true', default=False, help='Whether to remove duplicate sequences')
-    (options, args) = parser.parse_args()
-
-    targets = []
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-i', required=True, help='Path to input FASTA file')
+    parser.add_argument('-o', required=True, help='Path to output FASTA file')
+    parser.add_argument('-d', help='Path to discarded entries file')
+    header_criteria = parser.add_mutually_exclusive_group()
+    header_criteria.add_argument('--id_list', help='Path to the ID list file')
+    header_criteria.add_argument('--header_regexp', help='Regular expression pattern the header should match')
+    sequence_criteria = parser.add_mutually_exclusive_group()
+    sequence_criteria.add_argument('--min_length', type=int, help='Minimum sequence length')
+    sequence_criteria.add_argument('--sequence_regexp', help='Regular expression pattern the sequence should match')
+    parser.add_argument('--max_length', type=int, help='Maximum sequence length')
+    parser.add_argument('--dedup', action='store_true', default=False, help='Whether to remove duplicate sequences')
+    options = parser.parse_args()

-    with open(args[0]) as f_target:
-        for line in f_target.readlines():
-            targets.append(">%s" % line.strip().upper())
+    if options.min_length is not None and options.max_length is None:
+        options.max_length = sys.maxsize
+    if options.header_regexp:
+        header_regexp = re.compile(options.header_regexp)  # separate name: header and sequence patterns can both be given
+    if options.sequence_regexp:
+        sequence_regexp = re.compile(options.sequence_regexp)

-    print('Read target file, now looking for %d sequences.' % len(targets))
+    work_summary = {'found': 0}

-    work_summary = {'wanted': len(targets), 'found': 0}
     if options.dedup:
         used_sequences = set()
         work_summary['duplicates'] = 0
-    homd_db = FASTAReader(args[1])

-    with open(args[2], "w") as output:
+    if options.id_list:
+        targets = []
+        with open(options.id_list) as f_target:
+            for line in f_target.readlines():
+                targets.append(line.strip().upper())
+        work_summary['wanted'] = len(targets)
+
+    homd_db = FASTAReader_gen(options.i)
+    discarded = open(options.d, 'w') if options.d else None  # closed after the main loop
+    with open(options.o, "w") as output:
         for entry in homd_db:
-            target_matched_results = target_match(targets, entry.header)
-            if target_matched_results:
-                work_summary['found'] += 1
-                targets.remove(target_matched_results)
-                sequence = entry.get_sequence()
+            print_entry = True
+            if options.id_list:
+                target_matched_results = target_match(targets, entry.header)
+                if target_matched_results:
+                    work_summary['found'] += 1
+                    targets.remove(target_matched_results)
+                else:
+                    print_entry = False
+            elif options.header_regexp:
+                if header_regexp.search(entry.header) is None:
+                    print_entry = False
+            if options.min_length is not None:
+                sequence_length = len(entry.sequence)
+                if not(options.min_length <= sequence_length <= options.max_length):
+                    print_entry = False
+            elif options.sequence_regexp:
+                if sequence_regexp.search(entry.sequence) is None:
+                    print_entry = False
+            if print_entry:
                 if options.dedup:
-                    if sequence in used_sequences:
+                    if entry.sequence in used_sequences:
                         work_summary['duplicates'] += 1
                         continue
                     else:
-                        used_sequences.add(sequence)
-                print(entry.header, file=output)
-                print(sequence, file=output)
+                        used_sequences.add(entry.sequence)
+                entry.print(output)
+            elif options.d:
+                entry.print(discarded)
+    if discarded is not None:
+        discarded.close()

-    print('Completed filtering.')
     for parm, count in work_summary.items():
         print('%s ==> %d' % (parm, count))

+
 if __name__ == "__main__":
     main()
--- a/filter_by_fasta_ids.xml	Tue May 24 13:05:22 2016 -0400
+++ b/filter_by_fasta_ids.xml	Sat Apr 28 03:49:28 2018 -0400
@@ -1,40 +1,142 @@
-<tool id="filter_by_fasta_ids" version="1.0" name="Filter by FASTA IDs">
-    <description>Extract sequences from a FASTA file based on a list of IDs</description>
-    <command>
-<![CDATA[
-        python $__tool_directory__/filter_by_fasta_ids.py
-            $dedup
-            '$identifiers'
-            '$input'
-            '$output'
-]]>
-    </command>
+<tool id="filter_by_fasta_ids" name="Filter FASTA" version="2.0">
+    <description>on the headers and/or the sequences</description>
+    <macros>
+        <xml name="regexp_macro" token_label="Regular expression pattern">
+            <param name="regexp" type="text" value="" label="@LABEL@" help="Use the Python regular expression syntax as specified in https://docs.python.org/3/library/re.html">
+                <validator type="empty_field" />
+                <sanitizer>
+                    <valid initial="string.printable">
+                        <remove value="'"/>
+                    </valid>
+                    <mapping initial="none">
+                        <add source="'" target="'&quot;'&quot;'" />
+                    </mapping>
+                </sanitizer>
+            </param>
+        </xml>
+    </macros>
+    <requirements>
+        <requirement type="package" version="3.6.5">python</requirement>
+    </requirements>
+    <command><![CDATA[
+python '$__tool_directory__/filter_by_fasta_ids.py'
+-i '$input'
+#if $header_criteria.header_criteria_select == 'id_list'
+    --id_list '$header_criteria.identifiers'
+#elif $header_criteria.header_criteria_select == 'regexp'
+    --header_regexp '$header_criteria.regexp'
+#end if
+#if $sequence_criteria.sequence_criteria_select == 'seq_length'
+    --min_length $sequence_criteria.min_length
+    #if str($sequence_criteria.max_length)
+        --max_length $sequence_criteria.max_length
+    #end if
+#elif $sequence_criteria.sequence_criteria_select == 'regexp'
+    --sequence_regexp '$sequence_criteria.regexp'
+#end if
+$dedup
+-o '$output'
+#if $output_discarded
+    -d '$discarded'
+#end if
+    ]]></command>
     <inputs>
-        <param format="fasta" name="input" type="data" label="FASTA sequences"/>
-        <param format="txt" name="identifiers" type="data" label="List of IDs to extract sequences for"/>
-        <param name="dedup" type="boolean" truevalue="--dedup" falsevalue="" checked="true" label="Remove duplicate sequences" />
+        <param name="input" type="data" format="fasta" label="FASTA sequences"/>
+        <conditional name="header_criteria">
+            <param name="header_criteria_select" type="select" label="Criteria for filtering on the headers">
+                <option value="">No filtering</option>
+                <option value="id_list">List of IDs</option>
+                <option value="regexp">Regular expression on the headers</option>
+            </param>
+            <when value="" />
+            <when value="id_list">
+                <param name="identifiers" type="data" format="txt" label="List of IDs to extract sequences for"/>
+            </when>
+            <when value="regexp">
+                <expand macro="regexp_macro" label="Regular expression pattern the header should match" />
+            </when>
+        </conditional>
+        <conditional name="sequence_criteria">
+            <param name="sequence_criteria_select" type="select" label="Criteria for filtering on the sequences">
+                <option value="">No filtering</option>
+                <option value="seq_length">Sequence length</option>
+                <option value="regexp">Regular expression on the sequences</option>
+            </param>
+            <when value="" />
+            <when value="seq_length">
+                <param name="min_length" type="integer" value="0" label="Minimum length" />
+                <param name="max_length" type="integer" min="1" value="" optional="true" label="Maximum length" />
+            </when>
+            <when value="regexp">
+                <expand macro="regexp_macro" label="Regular expression pattern the sequence should match" />
+            </when>
+        </conditional>
+        <param name="dedup" type="boolean" truevalue="--dedup" falsevalue="" label="Remove duplicate sequences" />
+        <param name="output_discarded" type="boolean" label="Output discarded FASTA entries" />
     </inputs>
     <outputs>
-        <data format="fasta" name="output" label="FASTA sequences for ${identifiers.name}"/>
+        <data name="output" format="fasta" label="${tool.name} on ${on_string}: FASTA sequences"/>
+        <data name="discarded" format="fasta" label="${tool.name} on ${on_string}: discarded entries">
+            <filter>output_discarded</filter>
+        </data>
     </outputs>
     <tests>
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" ftype="fasta" value="input.fasta" />
+            <param name="header_criteria_select" value="id_list" />
             <param name="identifiers" ftype="txt" value="ids.txt" />
+            <param name="dedup" value="True" />
             <output name="output" file="output_dedup.fasta" />
         </test>
-        <test>
+        <test expect_num_outputs="2">
             <param name="input" ftype="fasta" value="input.fasta" />
+            <param name="header_criteria_select" value="id_list" />
             <param name="identifiers" ftype="txt" value="ids.txt" />
             <param name="dedup" value="False" />
+            <param name="output_discarded" value="True" />
             <output name="output" file="output_not_dedup.fasta" />
+            <output name="discarded" file="discarded_not_dedup.fasta" />
+        </test>
+        <test expect_num_outputs="2">
+            <param name="input" ftype="fasta" value="input.fasta" />
+            <param name="header_criteria_select" value="regexp" />
+            <param name="regexp" value="2" />
+            <param name="dedup" value="False" />
+            <param name="output_discarded" value="True" />
+            <output name="output" file="output_header_regexp.fasta" />
+            <output name="discarded" file="discarded_header_regexp.fasta" />
+        </test>
+        <test expect_num_outputs="2">
+            <param name="input" ftype="fasta" value="input.fasta" />
+            <param name="sequence_criteria_select" value="seq_length" />
+            <param name="min_length" value="5" />
+            <param name="dedup" value="False" />
+            <param name="output_discarded" value="True" />
+            <output name="output" file="output_min_length5.fasta" />
+            <output name="discarded" file="discarded_min_length5.fasta" />
+        </test>
+        <test expect_num_outputs="2">
+            <param name="input" ftype="fasta" value="input.fasta" />
+            <param name="sequence_criteria_select" value="seq_length" />
+            <param name="max_length" value="4" />
+            <param name="dedup" value="False" />
+            <param name="output_discarded" value="True" />
+            <output name="output" file="output_max_length4.fasta" />
+            <output name="discarded" file="discarded_max_length4.fasta" />
+        </test>
+        <test expect_num_outputs="2">
+            <param name="input" ftype="fasta" value="input.fasta" />
+            <param name="sequence_criteria_select" value="regexp" />
+            <param name="regexp" value="T{2,}" />
+            <param name="dedup" value="False" />
+            <param name="output_discarded" value="True" />
+            <output name="output" file="output_sequence_regexp.fasta" />
+            <output name="discarded" file="discarded_sequence_regexp.fasta" />
         </test>
     </tests>
-    <help>
-<![CDATA[
+    <help><![CDATA[
 **What it does**

-Extract sequences from a FASTA file based on a list of IDs.
-]]>
-    </help>
+Filter entries of a FASTA file on the headers and/or the sequences based on various criteria.
+    ]]></help>
 </tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/discarded_header_regexp.fasta	Sat Apr 28 03:49:28 2018 -0400
@@ -0,0 +1,9 @@
+>1
+TGAC
+>3
+ACGT
+>4
+ACGT
+TGAC
+>5
+TTTT
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/discarded_max_length4.fasta	Sat Apr 28 03:49:28 2018 -0400
@@ -0,0 +1,8 @@
+>2
+AAAAAAAA
+>2_bis
+AAAA
+AAAA
+>4
+ACGT
+TGAC
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/discarded_min_length5.fasta	Sat Apr 28 03:49:28 2018 -0400
@@ -0,0 +1,6 @@
+>1
+TGAC
+>3
+ACGT
+>5
+TTTT
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/discarded_not_dedup.fasta	Sat Apr 28 03:49:28 2018 -0400
@@ -0,0 +1,4 @@
+>1
+TGAC
+>5
+TTTT
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/discarded_sequence_regexp.fasta	Sat Apr 28 03:49:28 2018 -0400
@@ -0,0 +1,9 @@
+>1
+TGAC
+>2
+AAAAAAAA
+>3
+ACGT
+>2_bis
+AAAA
+AAAA
--- a/test-data/output_dedup.fasta	Tue May 24 13:05:22 2016 -0400
+++ b/test-data/output_dedup.fasta	Sat Apr 28 03:49:28 2018 -0400
@@ -3,4 +3,5 @@
 >3
 ACGT
 >4
-ACGTTGAC
+ACGT
+TGAC
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_header_regexp.fasta	Sat Apr 28 03:49:28 2018 -0400
@@ -0,0 +1,5 @@
+>2
+AAAAAAAA
+>2_bis
+AAAA
+AAAA
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_max_length4.fasta	Sat Apr 28 03:49:28 2018 -0400
@@ -0,0 +1,6 @@
+>1
+TGAC
+>3
+ACGT
+>5
+TTTT
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_min_length5.fasta	Sat Apr 28 03:49:28 2018 -0400
@@ -0,0 +1,8 @@
+>2
+AAAAAAAA
+>2_bis
+AAAA
+AAAA
+>4
+ACGT
+TGAC
--- a/test-data/output_not_dedup.fasta	Tue May 24 13:05:22 2016 -0400
+++ b/test-data/output_not_dedup.fasta	Sat Apr 28 03:49:28 2018 -0400
@@ -3,6 +3,8 @@
 >3
 ACGT
 >2_bis
-AAAAAAAA
+AAAA
+AAAA
 >4
-ACGTTGAC
+ACGT
+TGAC
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_sequence_regexp.fasta	Sat Apr 28 03:49:28 2018 -0400
@@ -0,0 +1,5 @@
+>4
+ACGT
+TGAC
+>5
+TTTT