diff import.xml @ 13:d330b3082107 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/anndata/ commit 67b3808b56df343798263ff0c905df8cb789edfa
author iuc
date Sat, 14 Sep 2024 19:58:24 +0000
parents 93dd15e13e6a
children
line wrap: on
line diff
--- a/import.xml	Sun Nov 12 16:42:57 2023 +0000
+++ b/import.xml	Sat Sep 14 19:58:24 2024 +0000
@@ -1,9 +1,9 @@
-<tool id="anndata_import" name="Import Anndata and loom" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.09">
-    <description>from different format</description>
+<tool id="anndata_import" name="Import Anndata" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+    <description>from different formats</description>
     <macros>
         <import>macros.xml</import>
         <xml name="params_10x">
-            <param name="barcodes" type="data" format="tabular,txt" label="Barcodes"/>
+            <param name="barcodes" type="data" format="tabular,txt" label="Barcodes (whitelist) file with one barcode per line"/>
             <param name="var_names" type="select" label="Variables index">
                 <option value="gene_symbols">gene_symbols</option>
                 <option value="gene_ids">gene_ids</option>
@@ -13,164 +13,132 @@
         </xml>
     </macros>
     <expand macro="requirements">
-        <requirement type="package" version="1.9.6">scanpy</requirement>
+        <requirement type="package" version="1.10.2">scanpy</requirement>
     </expand>
     <expand macro="version_command"/>
     <command detect_errors="exit_code"><![CDATA[
-#if $hd5_format.filetype == 'anndata'
-    #if $hd5_format.in.adata_format == 'mtx'
-        mkdir mtx
-        #if $hd5_format.in.tenx.use == 'legacy_10x'
-            && cp '$hd5_format.in.matrix' 'mtx/matrix.mtx'
-            && cp '$hd5_format.in.tenx.genes' 'mtx/genes.tsv'
-            && cp '$hd5_format.in.tenx.barcodes' 'mtx/barcodes.tsv'
-        #else if $hd5_format.in.tenx.use == 'v3_10x'
-            && cp '$hd5_format.in.matrix' 'mtx/matrix.mtx'
-            && gzip 'mtx/matrix.mtx'
-            && cp '$hd5_format.in.tenx.features' 'mtx/features.tsv'
-            && gzip 'mtx/features.tsv'
-            && cp '$hd5_format.in.tenx.barcodes' 'mtx/barcodes.tsv'
-            && gzip 'mtx/barcodes.tsv'
-        #end if
-        &&
-    #else if $hd5_format.in.adata_format == 'umi_tools'
-        ## avoid gzipping in the inputdir
-        gzip -c '$hd5_format.in.input' > umi_tools_input.gz
-        &&
+#if $in.adata_format == 'mtx'
+    mkdir mtx
+    #if $in.tenx.use == 'legacy_10x'
+        && cp '$in.matrix' 'mtx/matrix.mtx'
+        && cp '$in.tenx.genes' 'mtx/genes.tsv'
+        && cp '$in.tenx.barcodes' 'mtx/barcodes.tsv'
+    #else if $in.tenx.use == 'v3_10x'
+        && cp '$in.matrix' 'mtx/matrix.mtx'
+        && gzip 'mtx/matrix.mtx'
+        && cp '$in.tenx.features' 'mtx/features.tsv'
+        && gzip 'mtx/features.tsv'
+        && cp '$in.tenx.barcodes' 'mtx/barcodes.tsv'
+        && gzip 'mtx/barcodes.tsv'
     #end if
-
-    @CMD@
-
-    #if $hd5_format.in.adata_format == 'mtx'
-        && rm -rf mtx
-    #end if
+    &&
+#else if $in.adata_format == 'umi_tools'
+    ## avoid gzipping in the inputdir
+    gzip -c '$in.input' > umi_tools_input.gz
+    &&
+#end if
 
-#else:
-        python '$__tool_directory__/tsv_to_loompy.py'
-        -c '${hd5_format.coldata}'
-        -r '${hd5_format.rowdata}'
-        -f '${hd5_format.mainmatrix}'
-        #if $hd5_format.other_files:
-            '${hd5_format.other_files}'
-        #end if
-#end if
+@CMD@
       ]]></command>
     <configfiles>
         <configfile name="script_file"><![CDATA[
 @CMD_imports@
-#if $hd5_format.filetype == 'anndata'
-    #if $hd5_format.in.adata_format == 'loom'
+#if $in.adata_format == 'loom'
 adata = ad.read_loom(
-    '$hd5_format.in.input',
-    sparse=$hd5_format.in.sparse,
-    cleanup=$hd5_format.in.cleanup,
-    X_name='$hd5_format.in.x_name',
-    obs_names='$hd5_format.in.obs_names',
-    var_names='$hd5_format.in.var_names')
+    '$in.input',
+    sparse=$in.sparse,
+    cleanup=$in.cleanup,
+    X_name='$in.x_name',
+    obs_names='$in.obs_names',
+    var_names='$in.var_names')
 
-    #else if $hd5_format.in.adata_format == 'tabular'
-        #set delimiter=$hd5_format.in.input.metadata.delimiter
-        #if $delimiter != ','
-            #set delimiter='\\t'
-        #end if
+#else if $in.adata_format == 'tabular'
+    #set delimiter=$in.input.metadata.delimiter
+    #if $delimiter != ','
+        #set delimiter='\\t'
+    #end if
 adata = ad.read_csv(
-    '$hd5_format.in.input',
+    '$in.input',
     delimiter='$delimiter',
-    first_column_names=$hd5_format.in.first_column_names)
+    first_column_names=$in.first_column_names)
+## Convert X to a sparse matrix: a dense matrix needs more disk space and has higher access times
+from scipy.sparse import csr_matrix
+adata.X = csr_matrix(adata.X)
 
-    #else if $hd5_format.in.adata_format == '10x_h5'
+#else if $in.adata_format == '10x_h5'
 import scanpy as sc
-adata = sc.read_10x_h5('$hd5_format.in.input')
+adata = sc.read_10x_h5('$in.input')
 
-    #else if $hd5_format.in.adata_format == 'mtx'
-        #if $hd5_format.in.tenx.use == 'no'
-adata = ad.read_mtx(filename='$hd5_format.in.matrix')
-        #else
+#else if $in.adata_format == 'mtx'
+    #if $in.tenx.use == 'no'
+adata = ad.read_mtx(filename='$in.matrix')
+    #else
 import scanpy as sc
 adata = sc.read_10x_mtx(
     'mtx',
-    var_names='$hd5_format.in.tenx.var_names',
-    make_unique=$hd5_format.in.tenx.make_unique,
+    var_names='$in.tenx.var_names',
+    make_unique=$in.tenx.make_unique,
     cache=False,
-    gex_only=$hd5_format.in.tenx.gex_only)
-        #end if
+    gex_only=$in.tenx.gex_only)
+    #end if
 
-    #else if $hd5_format.in.adata_format == 'umi_tools'
+#else if $in.adata_format == 'umi_tools'
 adata = ad.read_umi_tools('umi_tools_input.gz')
-    #end if
-adata.write('anndata.h5ad')
 #end if
+adata.write('anndata.h5ad', compression='gzip')
+print(adata)
 ]]></configfile>
     </configfiles>
     <inputs>
-        <conditional name="hd5_format">
-            <param name="filetype" type="select" label="hd5 format to be created">
-                <option value="anndata" selected="true">Anndata file</option>
-                <option value="loom">Loom file</option>
+        <conditional name="in">
+            <param name="adata_format" type="select" label="Create AnnData from"><!-- selected value is tested as $in.adata_format in the command and script_file templates -->
+                <option value="loom">Loom data</option>
+                <option value="tabular">Tabular, CSV, TSV</option>
+                <option value="10x_h5">H5 format from Cell ranger or not</option>
+                <option value="mtx">Matrix Market (mtx), from Cell ranger or not</option>
+                <option value="umi_tools">UMI tools count matrix</option>
             </param>
-            <when value="anndata">
-                <conditional name="in">
-                    <param name="adata_format" type="select" label="Format for the annotated data matrix">
-                        <option value="loom">Loom</option>
-                        <option value="tabular">Tabular, CSV, TSV</option>
-                        <option value="10x_h5">H5 format from Cell ranger or not</option>
-                        <option value="mtx">Matrix Market (mtx), from Cell ranger or not</option>
-                        <option value="umi_tools">UMI tools</option>
+            <when value="loom">
+                <param name="input" type="data" format="loom" label="Annotated data matrix"/>
+                <param name="sparse" type="boolean" truevalue="True" falsevalue="False" checked="true" label="Is the data matrix to read sparse?"/>
+                <param name="cleanup" type="boolean" truevalue="True" falsevalue="False" checked="false" label="Cleanup?"/>
+                <param name="x_name" type="text" value="spliced" label="X_name"/>
+                <param name="obs_names" type="text" value="CellID" label="obs_names"/>
+                <param name="var_names" type="text" value="Gene" label="var_names"/>
+            </when>
+            <when value="tabular">
+                <param name="input" type="data" format="tabular,csv,tsv" label="Annotated data matrix"/>
+                <param name="first_column_names" type="boolean" truevalue="True" falsevalue="False" checked="true" label="Does the first column store the row names?"/>
+            </when>
+            <when value="10x_h5" >
+                <param name="input" type="data" format="h5" label="Data matrix"/>
+            </when>
+            <when value="mtx">
+                <param name="matrix" type="data" format="mtx" label="Matrix in Matrix Market format"/>
+                <conditional name="tenx">
+                    <param name="use" type="select" label="Use 10x Genomics formatted mtx">
+                        <option value="no">No</option>
+                        <option value="legacy_10x">Output from Cell Ranger v2 or earlier versions</option>
+                        <option value="v3_10x">Output from Cell Ranger v3 or later versions</option>
                     </param>
-                    <when value="loom">
-                        <param name="input" type="data" format="loom" label="Annotated data matrix"/>
-                        <param name="sparse" type="boolean" truevalue="True" falsevalue="False" checked="true" label="Is the data matrix to read sparse?"/>
-                        <param name="cleanup" type="boolean" truevalue="True" falsevalue="False" checked="false" label="Cleanup?"/>
-                        <param name="x_name" type="text" value="spliced" label="X_name"/>
-                        <param name="obs_names" type="text" value="CellID" label="obs_names"/>
-                        <param name="var_names" type="text" value="Gene" label="var_names"/>
+                    <when value="no"/>
+                    <when value="legacy_10x">
+                        <param name="genes" type="data" format="tabular" label="2-column genes file with gene id and gene name"/>
+                        <expand macro="params_10x"/>
                     </when>
-                    <when value="tabular">
-                        <param name="input" type="data" format="tabular,csv,tsv" label="Annotated data matrix"/>
-                        <param name="first_column_names" type="boolean" truevalue="True" falsevalue="False" checked="true" label="Does the first column store the row names?"/>
-                    </when>
-                    <when value="10x_h5" >
-                        <param name="input" type="data" format="h5" label="Data matrix"/>
-                    </when>
-                    <when value="mtx">
-                        <param name="matrix" type="data" format="mtx" label="Matrix"/>
-                        <conditional name="tenx">
-                            <param name="use" type="select" label="Use 10x Genomics formatted mtx">
-                                <option value="no">No</option>
-                                <option value="legacy_10x">Output from Cell Ranger v2 or earlier versions</option>
-                                <option value="v3_10x">Output from Cell Ranger v3 or later versions</option>
-                            </param>
-                            <when value="no"/>
-                            <when value="legacy_10x">
-                                <param name="genes" type="data" format="tabular" label="Genes"/>
-                                <expand macro="params_10x"/>
-                            </when>
-                            <when value="v3_10x">
-                                <param name="features" type="data" format="tabular" label="Features"/>
-                                <expand macro="params_10x"/>
-                            </when>
-                        </conditional>
-                    </when>
-                    <when value="umi_tools">
-                        <param name="input" type="data" format="tabular" label="condensed count matrix from UMI tools"/>
+                    <when value="v3_10x">
+                        <param name="features" type="data" format="tabular" label="3-column features file with feature id, feature name and feature type" help="For scRNA-seq, features are genes. The 3rd column typically contains the word 'Gene Expression'"/>
+                        <expand macro="params_10x"/>
                     </when>
                 </conditional>
             </when>
-            <when value="loom">
-                <param name="mainmatrix" type="data" format="tabular" label="File for main layer of loom file." help="All subsequent tsv must be the same dimensions as this file. When converted back to tsv using hd5 export, this will be labeled as 'mainmatrix.tsv'"/>
-                <param name="other_files" type="data" format="tabular" multiple="true" optional="true" label="Add layers" help="Adds layers of same dimension to the loom file. When converted to tsv using hd5 export, these layers will retain their names."/>
-                <param name="coldata" type="data" format="tabular" label="Tsv of column data." help="First row is column attributes, subsequent are values."/>
-                <param name="rowdata" type="data" format="tabular" label="Tsv of row data." help="First row is row attributes, subsequent are values."/>
+            <when value="umi_tools">
+                <param name="input" type="data" format="tabular" label="condensed count matrix from UMI tools"/>
             </when>
         </conditional>
     </inputs>
     <outputs>
-        <data name="anndata" format="h5ad" from_work_dir="anndata.h5ad" label="Anndata import on ${on_string}">
-            <filter>hd5_format['filetype'] == 'anndata'</filter>
-        </data>
-        <data name="loomdata" format="loom" from_work_dir="converted.loom" label="Loom import on ${on_string}">
-            <filter>hd5_format['filetype'] == 'loom'</filter>
-        </data>
+        <data name="anndata" format="h5ad" from_work_dir="anndata.h5ad" label="Anndata import on ${on_string}"/>
     </outputs>
     <tests>
         <test expect_num_outputs="1">
@@ -191,7 +159,13 @@
                 <has_text_matching expression="obs_names='CellID'"/>
                 <has_text_matching expression="var_names='Gene'"/>
             </assert_stdout>
-            <output name="anndata" value="import.loom.krumsiek11.h5ad" ftype="h5ad" compare="sim_size"/>
+            <output name="anndata" ftype="h5ad">
+                <assert_contents>
+                    <has_h5_keys keys="obs/cell_type"/>
+                    <has_h5_keys keys="obs/obs_names"/>
+                    <has_h5_keys keys="var/var_names"/>
+                </assert_contents>
+            </output>
         </test>
         <test expect_num_outputs="1">
             <conditional name="in">
@@ -203,8 +177,8 @@
                 <has_text_matching expression="ad.read_csv"/>
                 <has_text_matching expression="delimiter=','"/>
                 <has_text_matching expression="first_column_names=True"/>
+                <has_text_matching expression="3 × 2"/>
             </assert_stdout>
-            <output name="anndata" value="import.csv.h5ad" ftype="h5ad" compare="sim_size"/>
         </test>
         <test expect_num_outputs="1">
             <conditional name="in">
@@ -216,8 +190,8 @@
                 <has_text_matching expression="ad.read_csv"/>
                 <has_text_matching expression="delimiter='\\t'"/>
                 <has_text_matching expression="first_column_names=True"/>
+                <has_text_matching expression="3 × 2"/>
             </assert_stdout>
-            <output name="anndata" value="import.tsv.h5ad" ftype="h5ad" compare="sim_size"/>
         </test>
         <test expect_num_outputs="1">
             <conditional name="in">
@@ -226,8 +200,10 @@
                 <conditional name="tenx">
                     <param name="use" value="no"/>
                 </conditional>
-                </conditional>
-            <output name="anndata" value="import.mtx.no_10x.h5ad" ftype="h5ad" compare="sim_size"/>
+            </conditional>
+            <assert_stdout>
+                <has_text_matching expression="343 × 12"/>
+            </assert_stdout>
         </test>
         <test expect_num_outputs="1">
             <conditional name="in">
@@ -242,7 +218,14 @@
                     <param name="gex_only" value="true"/>
                 </conditional>
             </conditional>
-            <output name="anndata" value="import.mtx.legacy_10x.h5ad" ftype="h5ad" compare="sim_size"/>
+            <assert_stdout>
+                <has_text_matching expression="12 × 343"/>
+            </assert_stdout>
+            <output name="anndata" ftype="h5ad">
+                <assert_contents>
+                    <has_h5_keys keys="var/gene_ids"/>
+                </assert_contents>
+            </output>
         </test>
         <test expect_num_outputs="1">
             <conditional name="in">
@@ -257,31 +240,29 @@
                     <param name="gex_only" value="true"/>
                 </conditional>
             </conditional>
-            <output name="anndata" value="import.mtx.v3_10x.h5ad" ftype="h5ad" compare="sim_size"/>
+            <assert_stdout>
+                <has_text_matching expression="1107 × 507"/>
+            </assert_stdout>
+            <output name="anndata" ftype="h5ad">
+                <assert_contents>
+                    <has_h5_keys keys="var/gene_ids"/>
+                    <has_h5_keys keys="var/feature_types"/>
+                </assert_contents>
+            </output>
         </test>
         <test expect_num_outputs="1">
-            <param name="filetype" value="anndata"/>
             <conditional name="in">
                 <param name="adata_format" value="umi_tools"/>
                 <param name="input" value="umi_tools.tsv"/>
             </conditional>
-            <output name="anndata" value="import.umi_tools.h5ad" ftype="h5ad" compare="sim_size"/>
-        </test>
-        <test expect_num_outputs="1">
-            <param name="filetype" value="loom"/>
-            <param name="mainmatrix" value="firstlayer.tsv"/>
-            <param name="other_files" value="secondlayer.tsv"/>
-            <param name="coldata" value="cols.tsv"/>
-            <param name="rowdata" value="rows.tsv"/>
-            <output name="loomdata" value="converted.loom.test" ftype="loom" compare="sim_size"/>
+            <assert_stdout>
+                <has_text_matching expression="2 × 13"/>
+            </assert_stdout>
         </test>
         <test expect_num_outputs="1"><!-- 10x h5 test -->
-            <conditional name="hd5_format">
-                <param name="filetype" value="anndata"/>
-                <conditional name="in">
-                    <param name="adata_format" value="10x_h5"/>
-                    <param name="input" value="dropletutils_input.h5"/>
-                </conditional>
+            <conditional name="in">
+                <param name="adata_format" value="10x_h5"/>
+                <param name="input" value="dropletutils_input.h5"/>
             </conditional>
             <output name="anndata">
                 <assert_contents>
@@ -296,7 +277,7 @@
 
 **What it does**
 
-This tool creates an AnnData or loom dataset from several input types:
+This tool creates an AnnData dataset from several input types:
 
 - Loom (`read_loom method <https://anndata.readthedocs.io/en/latest/generated/anndata.read_loom.html>`__)
 - Tabular (`read_csv method <https://anndata.readthedocs.io/en/latest/generated/anndata.read_csv.html>`__)