<!-- Mercurial > repos > greg > vsnp_sample_names -->

<tool id="vsnp_sample_names" name="vSNP: sample names" version="@WRAPPER_VERSION@.1" profile="@PROFILE@">
    <description></description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <command detect_errors="exit_code"><![CDATA[
## Derive a sample name from the selected fastq input(s) and write it to the output dataset.
##   single: sanitized element identifier of read1 (dots are kept)
##   pair:   first block of characters shared by the two sanitized element identifiers
##   paired: sanitized dataset name of the forward read of the collection
## A read marker (_R1) or a trailing file extension is then trimmed from the name.
#import difflib
#import os
#import re

#if $input_type_cond.input_type == 'single':
    #set read1 = $input_type_cond.read1
    ## Replace everything except whitespace, word characters, '-' and '.' with '_'.
    #set sample_name = re.sub('[^\s\w\-\\.]', '_', str($read1.element_identifier))
#else if $input_type_cond.input_type == 'pair':
    ## Here (and in the collection case) dots are replaced with '_' as well.
    #set read1 = $input_type_cond.read1
    #set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1.element_identifier))
    #set read2 = $input_type_cond.read2
    #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.element_identifier))
    #set matches = difflib.SequenceMatcher(None, read1_identifier, read2_identifier).get_matching_blocks()
    ## The last entry of matches is always a zero-size sentinel, so matches[0] is that
    ## sentinel when the two identifiers have nothing in common.
    #set match = $matches[0]
    #if match.size > 0:
        #set sample_name = re.sub('[^\s\w\-]', '_', str($read1_identifier[match.a:match.a + match.size]))
    #else:
        ## Fall back to the read1 identifier instead of producing an empty sample name.
        #set sample_name = $read1_identifier
    #end if
#else:
    #set read1 = $input_type_cond.reads_collection['forward']
    #set read1_filename = $read1.name
    #set sample_name = re.sub('[^\s\w\-]', '_', str($read1_filename))
#end if

#if $sample_name.find('_R1') >0:
    ## Something like CMC_20E1_R1.fastq.gz
    #set sample_name = $sample_name.split('_R1')[0]
#else if $sample_name.find(".") > 0:
    #set dot_parts = $sample_name.split('.')
    ## Only drop two trailing parts when something would be left over; a name such as
    ## my_sample.gz would otherwise be reduced to an empty string.
    #if $read1.is_of_type('fastqsanger.gz') and $sample_name.endswith('gz') and len($dot_parts) > 2:
        ## Something like my_sample.fastq.gz
        #set sample_name = '.'.join($dot_parts[0:-2])
    #else:
        ## Something like my_sample.fastq
        #set sample_name = $os.path.splitext($sample_name)[0]
    #end if
#else if $sample_name.find("_") > 0:
    #set underscore_parts = $sample_name.split('_')
    ## Same guard as above: my_sample_gz style names keep their leading part.
    #if $read1.is_of_type('fastqsanger.gz') and $sample_name.endswith('gz') and len($underscore_parts) > 2:
        ## Something like my_sample_fastq_gz
        #set sample_name = '_'.join($underscore_parts[0:-2])
    #else:
        ## Something like my_sample_fastq
        #set sample_name = "_".join($underscore_parts[0:-1])
    #end if
#end if

## The substitutions above leave only whitespace, word characters, '-' and '.' in
## sample_name, so it cannot contain a single quote.
echo '$sample_name' > '$output'
]]></command>
    <inputs>
        <!-- The input_type selection decides which "when" block below supplies the read parameter(s). -->
        <conditional name="input_type_cond">
            <param name="input_type"
                   type="select"
                   label="Choose the category of the files to be analyzed">
                <option selected="true" value="single">Single dataset</option>
                <option value="pair">Dataset pair</option>
                <option value="paired">List of dataset pairs</option>
            </param>
            <!-- A single fastq dataset. -->
            <when value="single">
                <param name="read1"
                       type="data"
                       format="fastqsanger.gz,fastqsanger"
                       label="Read1 fastq file"/>
            </when>
            <!-- Two fastq datasets chosen separately. -->
            <when value="pair">
                <param name="read1"
                       type="data"
                       format="fastqsanger.gz,fastqsanger"
                       label="Read1 fastq file"/>
                <param name="read2"
                       type="data"
                       format="fastqsanger.gz,fastqsanger"
                       label="Read2 fastq file"/>
            </when>
            <!-- A paired collection; the command reads its "forward" element. -->
            <when value="paired">
                <param name="reads_collection"
                       type="data_collection"
                       collection_type="paired"
                       format="fastqsanger,fastqsanger.gz"
                       label="Collection of fastqsanger paired read files"/>
            </when>
        </conditional>
    </inputs>
    <outputs>
        <!-- Text dataset that receives the sample name echoed by the command. -->
        <data format="txt" name="output"/>
    </outputs>
    <tests>
        <!-- Single dataset named with an _R1 read marker -->
        <test>
            <param name="input_type" value="single"/>
            <param name="read1" dbkey="89" value="CMC_20E1_R1.fastq.gz"/>
            <output name="output" ftype="txt" file="sample_names.txt"/>
        </test>
        <!-- Forward and reverse reads supplied as two separate datasets -->
        <test>
            <param name="input_type" value="pair"/>
            <param name="read1" dbkey="89" value="CMC_20E1_R1.fastq.gz"/>
            <param name="read2" dbkey="89" value="CMC_20E1_R2.fastq.gz"/>
            <output name="output" ftype="txt" file="sample_names.txt"/>
        </test>
        <!-- Paired collection whose files carry _R1 / _R2 read markers -->
        <test>
            <param name="input_type" value="paired"/>
            <param name="reads_collection">
                <collection type="paired">
                    <element name="forward" value="CMC_20E1_R1.fastq.gz"/>
                    <element name="reverse" value="CMC_20E1_R2.fastq.gz"/>
                </collection>
            </param>
            <output name="output" ftype="txt" file="sample_names.txt"/>
        </test>
        <!-- Paired collection whose files have neither an _R1 marker nor a file extension -->
        <test>
            <param name="input_type" value="paired"/>
            <param name="reads_collection">
                <collection type="paired">
                    <element name="forward" value="SRR14085881_forward"/>
                    <element name="reverse" value="SRR14085881_reverse"/>
                </collection>
            </param>
            <output name="output" ftype="txt" file="sample_names2.txt"/>
        </test>
    </tests>
    <help>
**What it does**

Accepts fastqsanger or fastqsanger.gz sample files, extracts a unique portion of the file name as the sample name,
and writes it to the output. The output text file can be consumed by the **Parse parameter value** expression tool
to provide workflow parameter values to the **Read group identifier (ID)** and the **Sample name identifier (SM)**
parameters in the **Map with BWA-MEM** tool.
    </help>
    <expand macro="citations"/>
</tool>
<!--
author	greg
date	Fri, 03 Sep 2021 17:21:52 +0000
parents	fb3defef50e5
children	4f43f163c408
-->