Mercurial > repos > galaxyp > filter_by_fasta_ids

--- a/README.md	Fri Sep 26 14:23:16 2014 -0400
+++ b/README.md	Tue May 24 13:05:22 2016 -0400
@@ -1,7 +1,7 @@
 GalaxyP - Filter by FASTA IDs
 =============================

-* Home: <https://bitbucket.org/galaxyp/filter_by_fasta_ids>
+* Home: <https://github.com/galaxyproteomics/tools-galaxyp/>
 * Galaxy Tool Shed: <http://toolshed.g2.bx.psu.edu/view/galaxyp/filter_by_fasta_ids>
 * Tool ID: `filter_by_fasta_ids`

@@ -15,9 +15,9 @@
 GalaxyP Community
 -----------------

-Current governing community policies for [GalaxyP](https://bitbucket.org/galaxyp/) and other information can be found at:
+Current governing community policies for [GalaxyP](https://github.com/galaxyproteomics/) and other information can be found at:

-<https://bitbucket.org/galaxyp/galaxyp>
+<https://github.com/galaxyproteomics>


 License
@@ -35,7 +35,7 @@
 Contributing
 ------------

-Contributions to this repository are reviewed through pull requests. If you would like your work acknowledged, please also add yourself to the Authors section. If your pull request is accepted, you will also be acknowledged in <https://bitbucket.org/galaxyp/galaxyp/CONTRIBUTORS.md> unless you opt-out.
+Contributions to this repository are reviewed through pull requests. If you would like your work acknowledged, please also add yourself to the Authors section. If your pull request is accepted, you will also be acknowledged in <https://github.com/galaxyproteomics/tools-galaxyp/>.


 Authors
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_by_fasta_ids.py	Tue May 24 13:05:22 2016 -0400
@@ -0,0 +1,100 @@
+#!/usr/bin/env python
+""" A script to build specific fasta databases """
+from __future__ import print_function
+import optparse
+
+
+# ===================================== Iterator ===============================
+class Sequence:
+    ''' Holds one FASTA record: its header line and the raw lines of its sequence '''
+    def __init__(self):
+        self.header = ""  # header line of the record; empty until a reader assigns it
+        self.sequence_parts = []  # sequence lines as read from the file, line endings included
+
+    def get_sequence(self):  # returns all parts joined into one string, with trailing whitespace and CR/LF removed
+        return "".join([line.rstrip().replace('\n', '').replace('\r', '') for line in self.sequence_parts])
+
+
+class FASTAReader:
+    """
+        FASTA db iterator. Each iteration step returns one Sequence object (header plus sequence lines) read from the file.
+    """
+    def __init__(self, fasta_name):
+        self.fasta_file = open(fasta_name)  # NOTE(review): this class never closes the file handle
+        self.next_line = self.fasta_file.readline()  # one-line lookahead; it becomes the header of the next record
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        ''' Return the next record as a Sequence; raise StopIteration once the lookahead line is empty (EOF) '''
+        next_line = self.next_line
+        if not next_line:
+            raise StopIteration
+
+        seq = Sequence()
+        seq.header = next_line.rstrip().replace('\n', '').replace('\r', '')
+
+        next_line = self.fasta_file.readline()
+        while next_line and next_line[0] != '>':  # collect lines until the next '>' line or EOF
+            seq.sequence_parts.append(next_line)
+            next_line = self.fasta_file.readline()
+        self.next_line = next_line  # keep the lookahead ('>' line or empty string) for the following call
+        return seq
+
+    # Python 2/3 compat: Python 2 iteration calls next(), Python 3 calls __next__()
+    next = __next__
+
+
+def target_match(target, search_entry):
+    ''' Return the first item of target that occurs inside the upper-cased search_entry, or None if there is none '''
+    search_entry = search_entry.upper()
+    for atarget in target:
+        if search_entry.find(atarget) > -1:  # substring test, not equality: '>2' is also found in '>2_BIS'
+            return atarget
+    return None
+
+
+def main():
+    ''' Copy to args[2] the records of FASTA file args[1] whose header matches an ID listed (one per line) in args[0] '''
+
+    parser = optparse.OptionParser()
+    parser.add_option('--dedup', dest='dedup', action='store_true', default=False, help='Whether to remove duplicate sequences')
+    (options, args) = parser.parse_args()
+
+    targets = []
+
+    with open(args[0]) as f_target:
+        for line in f_target.readlines():
+            targets.append(">%s" % line.strip().upper())  # IDs are upper-cased and '>'-prefixed; NOTE(review): a blank line yields the target '>'
+
+    print('Read target file, now looking for %d sequences.' % len(targets))
+
+    work_summary = {'wanted': len(targets), 'found': 0}
+    if options.dedup:
+        used_sequences = set()  # sequences already written to the output
+        work_summary['duplicates'] = 0
+    homd_db = FASTAReader(args[1])
+
+    with open(args[2], "w") as output:
+        for entry in homd_db:
+            target_matched_results = target_match(targets, entry.header)
+            if target_matched_results:
+                work_summary['found'] += 1  # counted before the dedup check, so 'found' includes duplicates
+                targets.remove(target_matched_results)  # a target is consumed by its first match
+                sequence = entry.get_sequence()
+                if options.dedup:
+                    if sequence in used_sequences:
+                        work_summary['duplicates'] += 1
+                        continue
+                    else:
+                        used_sequences.add(sequence)
+                print(entry.header, file=output)
+                print(sequence, file=output)  # the sequence is written as a single line
+
+    print('Completed filtering.')
+    for parm, count in work_summary.items():
+        print('%s ==> %d' % (parm, count))
+
+if __name__ == "__main__":  # run main() only when executed as a script, not when imported
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_by_fasta_ids.xml	Tue May 24 13:05:22 2016 -0400
@@ -0,0 +1,40 @@
+<tool id="filter_by_fasta_ids" version="1.0" name="Filter by FASTA IDs">
+    <description>Extract sequences from a FASTA file based on a list of IDs</description>
+    <command>
+<![CDATA[
+        python $__tool_directory__/filter_by_fasta_ids.py
+            $dedup
+            '$identifiers'
+            '$input'
+            '$output'
+]]>
+    </command>
+    <inputs>
+        <param format="fasta" name="input" type="data" label="FASTA sequences"/>
+        <param format="txt" name="identifiers" type="data" label="List of IDs to extract sequences for"/>
+        <param name="dedup" type="boolean" truevalue="--dedup" falsevalue="" checked="true" label="Remove duplicate sequences" />
+    </inputs>
+    <outputs>
+        <data format="fasta" name="output" label="FASTA sequences for ${identifiers.name}"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input" ftype="fasta" value="input.fasta" />
+            <param name="identifiers" ftype="txt" value="ids.txt" />
+            <output name="output" file="output_dedup.fasta" />
+        </test>
+        <test>
+            <param name="input" ftype="fasta" value="input.fasta" />
+            <param name="identifiers" ftype="txt" value="ids.txt" />
+            <param name="dedup" value="False" />
+            <output name="output" file="output_not_dedup.fasta" />
+        </test>
+    </tests>
+    <help>
+<![CDATA[
+**What it does**
+
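+Extract the sequences from a FASTA file whose headers match one of the IDs in a list (one ID per line, matched case-insensitively), optionally removing duplicate sequences.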
+]]>
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ids.txt	Tue May 24 13:05:22 2016 -0400
@@ -0,0 +1,5 @@
+2
+2_bis
+3
+4
+6
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/input.fasta	Tue May 24 13:05:22 2016 -0400
@@ -0,0 +1,14 @@
+>1
+TGAC
+>2
+AAAAAAAA
+>3
+ACGT
+>2_bis
+AAAA
+AAAA
+>4
+ACGT
+TGAC
+>5
+TTTT
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_dedup.fasta	Tue May 24 13:05:22 2016 -0400
@@ -0,0 +1,6 @@
+>2
+AAAAAAAA
+>3
+ACGT
+>4
+ACGTTGAC
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_not_dedup.fasta	Tue May 24 13:05:22 2016 -0400
@@ -0,0 +1,8 @@
+>2
+AAAAAAAA
+>3
+ACGT
+>2_bis
+AAAAAAAA
+>4
+ACGTTGAC
--- a/tools/filter_by_fasta_ids.py	Fri Sep 26 14:23:16 2014 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,113 +0,0 @@
-#!/usr/bin/env python
-""" A script to build specific fasta databases """
-from __future__ import print_function
-import sys
-import logging
-
-#===================================== Iterator ===============================
-class Sequence:
-    ''' Holds protein sequence information '''
-    def __init__(self):
-        self.header = ""
-        self.sequence_parts = []
-
-    def get_sequence(self):
-        return "".join([line.rstrip().replace('\n','').replace('\r','') for line in self.sequence_parts])
-
-class FASTAReader:
-    """
-        FASTA db iterator. Returns a single FASTA sequence object.
-    """
-    def __init__(self, fasta_name):
-        self.fasta_file = open(fasta_name)
-        self.next_line = self.fasta_file.readline()
-
-    def __iter__(self):
-        return self
-
-    def __next__(self):
-        ''' Iteration '''
-        #while True:
-        #    line = self.fasta_file.readline()
-        #    if not line:
-        #        raise StopIteration
-        #    if line[0] == '>':
-        #        break
-        next_line = self.next_line
-        if not next_line:
-            raise StopIteration
-
-        seq = Sequence()
-        seq.header = next_line.rstrip().replace('\n','').replace('\r','')
-
-        next_line = self.fasta_file.readline()
-        while next_line and next_line[0] != '>':
-            #tail = self.fasta_file.tell()
-            #line = self.fasta_file.readline()
-            #if not line:
-            #    break
-            #if line[0] == '>':
-            #    self.fasta_file.seek(tail)
-            #    break
-            seq.sequence_parts.append(next_line)
-            next_line = self.fasta_file.readline()
-        self.next_line = next_line
-        return seq
-
-    # Python 2/3 compat
-    next = __next__
-#==============================================================================
-
-def target_match(target, search_entry):
-    ''' Matches '''
-    search_entry = search_entry.upper()
-    for atarget in target:
-        if search_entry.find(atarget) > -1:
-            return atarget
-    return None
-
-
-def main():
-    ''' the main function'''
-    logging.basicConfig(filename='filter_fasta_log',
-        level=logging.INFO,
-        format='%(asctime)s :: %(levelname)s :: %(message)s',)
-
-    used_sequences = set()
-    work_summary = {'wanted': 0, 'found':0, 'duplicates':0}
-    targets = []
-
-    f_target = open(sys.argv[1])
-    for line in f_target.readlines():
-        targets.append(">%s" % line.strip().upper())
-    f_target.close()
-
-    logging.info('Read target file and am now looking for %d %s', len(targets), 'sequences.')
-
-    work_summary['wanted'] = len(targets)
-    homd_db = FASTAReader(sys.argv[2])
-
-    i = 0
-    output = open(sys.argv[3], "w")
-    try:
-        for entry in homd_db:
-            target_matched_results = target_match(targets, entry.header)
-            if target_matched_results:
-                work_summary['found'] += 1
-                targets.remove(target_matched_results)
-                sequence = entry.get_sequence()
-                if sequence in used_sequences:
-                    work_summary['duplicates'] += 1
-                else:
-                    used_sequences.add(sequence)
-                    print(entry.header, file=output)
-                    print(sequence, file=output)
-    finally:
-        output.close()
-
-    logging.info('Completed filtering')
-    for parm, count in work_summary.items():
-        logging.info('%s ==> %d', parm, count)
-
-if __name__ == "__main__":
-    main()
--- a/tools/filter_by_fasta_ids.xml	Fri Sep 26 14:23:16 2014 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,13 +0,0 @@
-<tool id="filter_by_fasta_ids" version="1.0" name="Filter by FASTA IDs">
-  <description>Extract sequences from a FASTA file based on a list of IDs</description>
-  <command interpreter="python">filter_by_fasta_ids.py $identifiers $input $output</command>
-  <inputs>
-    <param format="fasta" name="input" type="data" label="FASTA sequences"/>
-    <param format="txt" name="identifiers" type="data" label="List of IDs to extract sequences for"/>
-  </inputs>
-  <outputs>
-    <data format="fasta" name="output" label="FASTA sequences for ${identifiers.name}"/>
-  </outputs>
-  <help>
-  </help>
-</tool>