Mercurial > repos > greg > sample_names

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Mon Nov 22 00:30:39 2021 +0000
@@ -0,0 +1,19 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<macros>
+    <token name="@TOOL_VERSION@">1.0</token>
+    <token name="@VERSION_SUFFIX@">2</token>
+    <token name="@PROFILE@">21.01</token>
+    <xml name="citations">
+        <citations>
+            <citation type="bibtex">
+                @misc{None,
+                journal = {None},
+                author = {1. Von Kuster G},
+                title = {Manuscript in preparation},
+                year = {None},
+                url = {https://github.com/USDA-VS/vSNP},}
+            </citation>
+        </citations>
+    </xml>
+</macros>
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sample_names.xml	Mon Nov 22 00:30:39 2021 +0000
@@ -0,0 +1,102 @@
+<tool id="sample_names" name="Extract sample names" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+    <description>from input file names</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <command detect_errors="exit_code"><![CDATA[
+#import difflib
+#import re
+
+#if $input_type_cond.input_type == 'single':
+    #set read1 = $input_type_cond.read1
+    #set sample_name = re.sub('[^\s\w\-]', '_', str($read1.element_identifier))
+#else if $input_type_cond.input_type == 'pair':
+    #set read1 = $input_type_cond.read1
+    #set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1.element_identifier))
+    #set read2 = $input_type_cond.read2
+    #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.element_identifier))
+    #set matches = difflib.SequenceMatcher(None, read1_identifier, read2_identifier).get_matching_blocks()
+    #set match = $matches[0]
+    #set sample_name = re.sub('[^\s\w\-]', '_', str($read1_identifier[match.a:match.a + match.size]))
+#else:
+    #set read1_name = $input_type_cond.reads_collection['forward'].name
+    #set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1_name))
+    #set read2_name = $input_type_cond.reads_collection['reverse'].name
+    #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2_name))
+    #set matches = difflib.SequenceMatcher(None, read1_identifier, read2_identifier).get_matching_blocks()
+    #set match = $matches[0]
+    #set sample_name = re.sub('[^\s\w\-]', '_', str($read1_identifier[match.a:match.a + match.size]))
+#end if
+
+echo '$sample_name' > '$output'
+]]></command>
+    <inputs>
+        <conditional name="input_type_cond">
+            <param name="input_type" type="select" label="Choose the category of the files to be analyzed">
+                <option value="single" selected="true">Single dataset</option>
+                <option value="pair">Dataset pair</option>
+                <option value="paired">List of dataset pairs</option>
+            </param>
+            <when value="single">
+                <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
+            </when>
+            <when value="pair">
+                <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
+                <param name="read2" type="data" format="fastqsanger.gz,fastqsanger" label="Read2 fastq file"/>
+            </when>
+            <when value="paired">
+                <param name="reads_collection" type="data_collection" format="fastqsanger,fastqsanger.gz" collection_type="paired" label="Collection of fastqsanger paired read files"/>
+            </when>
+        </conditional>
+    </inputs>
+    <outputs>
+        <data name="output" format="txt"/>
+    </outputs>
+    <tests>
+        <!-- Single files -->
+        <test>
+            <param name="input_type" value="single"/>
+            <param name="read1" value="CMC_20E1_R1.fastq.gz" dbkey="89"/>
+            <output name="output" file="sample_names.txt" ftype="txt"/>
+        </test>
+        <!-- Paired reads in separate datasets -->
+        <test>
+            <param name="input_type" value="pair"/>
+            <param name="read1" value="CMC_20E1_R1.fastq.gz" dbkey="89"/>
+            <param name="read2" value="CMC_20E1_R2.fastq.gz" dbkey="89"/>
+            <output name="output" file="sample_names2.txt" ftype="txt"/>
+        </test>
+        <!-- Collection of Paired reads -->
+        <test>
+            <param name="input_type" value="paired"/>
+            <param name="reads_collection">
+                <collection type="paired">
+                    <element name="forward" value="CMC_20E1_R1.fastq.gz"/>
+                    <element name="reverse" value="CMC_20E1_R2.fastq.gz"/>
+                </collection>
+            </param>
+            <output name="output" file="sample_names3.txt" ftype="txt"/>
+        </test>
+        <!-- Collection of Paired reads -->
+        <test>
+            <param name="input_type" value="paired"/>
+            <param name="reads_collection">
+                <collection type="paired">
+                    <element name="forward" value="SRR14085881_forward"/>
+                    <element name="reverse" value="SRR14085881_reverse"/>
+                </collection>
+            </param>
+            <output name="output" file="sample_names4.txt" ftype="txt"/>
+        </test>
+    </tests>
+    <help>
+**What it does**
+
+Accepts fastqsanger sample files, extracts a unique portion of the file name as the sample name, and writes it to
+the output.  The output text file can be consumed by the **Parse parameter value** expression tool to provide workflow
+parameter values to the **Read group identifier (ID)** and the **Sample name identifier (SM)**  parameters in the
+**Map with BWA-MEM** tool.
+    </help>
+    <expand macro="citations"/>
+</tool>
+
Binary file test-data/SRR14085881_forward has changed
Binary file test-data/SRR14085881_reverse has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample_names.txt	Mon Nov 22 00:30:39 2021 +0000
@@ -0,0 +1,1 @@
+CMC_20E1_R1_fastq_gz
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample_names2.txt	Mon Nov 22 00:30:39 2021 +0000
@@ -0,0 +1,1 @@
+CMC_20E1_R
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample_names3.txt	Mon Nov 22 00:30:39 2021 +0000
@@ -0,0 +1,1 @@
+CMC_20E1_R
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample_names4.txt	Mon Nov 22 00:30:39 2021 +0000
@@ -0,0 +1,1 @@
+SRR14085881_