Mercurial > repos > iuc > anndata_manipulate

--- a/macros.xml	Sun Nov 12 16:42:25 2023 +0000
+++ b/macros.xml	Sat Sep 14 19:58:00 2024 +0000
@@ -1,6 +1,7 @@
 <macros>
-    <token name="@TOOL_VERSION@">0.10.3</token>
+    <token name="@TOOL_VERSION@">0.10.9</token>
     <token name="@VERSION_SUFFIX@">0</token>
+    <token name="@PROFILE@">21.09</token>
     <xml name="requirements">
         <requirements>
             <requirement type="package" version="@TOOL_VERSION@">anndata</requirement>
@@ -21,12 +22,6 @@
 python '$script_file'
     ]]>
     </token>
-    <token name="@LOOMCMD@"><![CDATA[
-mkdir ./output &&
-mkdir ./attributes &&
-python '$__tool_directory__/loompy_to_tsv.py' -f '${hd5_format.input}'
-    ]]>
-    </token>
     <token name="@CMD_imports@"><![CDATA[
 import anndata as ad
     ]]>
@@ -76,4 +71,11 @@
             </when>
         </conditional>
     </xml>
+    <xml name="sanitize_query" token_validinitial="string.printable">
+        <sanitizer>
+            <valid initial="@VALIDINITIAL@">
+                <remove value="&apos;" />
+            </valid>
+       </sanitizer>
+    </xml>
 </macros>
--- a/manipulate.xml	Sun Nov 12 16:42:25 2023 +0000
+++ b/manipulate.xml	Sat Sep 14 19:58:00 2024 +0000
@@ -1,4 +1,4 @@
-<tool id="anndata_manipulate" name="Manipulate AnnData" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@">
+<tool id="anndata_manipulate" name="Manipulate AnnData" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
     <description>object</description>
     <macros>
         <import>macros.xml</import>
@@ -50,6 +50,27 @@
     key='$manipulate.key',
     categories=$categories)

+#else if $manipulate.function == 'remove_keys'
+    #if $manipulate.obs_keys
+        #set $keys = [x.strip() for x in str($manipulate.obs_keys).split(',') if x.strip()]
+adata.obs = adata.obs.drop(columns=$keys)
+    #end if
+## var keys are handled like obs keys: comma-separated, whitespace-trimmed, blank entries ignored
+    #if $manipulate.var_keys
+        #set $keys = [x.strip() for x in str($manipulate.var_keys).split(',') if x.strip()]
+adata.var = adata.var.drop(columns=$keys)
+    #end if
+
+#else if $manipulate.function == 'flag_genes'
+## adapted from anndata operations: one boolean var column per requested gene-name prefix
+    #for $flag in $manipulate.gene_flags
+k_cat = adata.var_names.str.startswith('${flag.startswith}')
+if k_cat.sum() > 0:
+    adata.var['${flag.col_name}'] = k_cat
+else:
+    print('No genes starting with ${flag.startswith} found.')
+    #end for
+
 #else if $manipulate.function == 'strings_to_categoricals'
 adata.strings_to_categoricals()

@@ -71,6 +92,14 @@
 adata.obs = obs
     #end if

+#else if $manipulate.function == 'split_on_obs'
+import os
+res_dir = "output_split"
+os.makedirs(res_dir, exist_ok=True)
+for s, field_value in enumerate(adata.obs['${manipulate.key}'].unique()):
+    ad_s = adata[adata.obs['${manipulate.key}'] == field_value]
+    ad_s.write(f'{res_dir}/${manipulate.key}_{s}.h5ad', compression='gzip')
+
 #else if $manipulate.function == 'filter'
     #if $manipulate.filter.filter == 'key'
         #if $manipulate.var_obs == 'var'
@@ -126,7 +155,11 @@

 #end if

-adata.write('anndata.h5ad')
+#if $manipulate.function != 'split_on_obs'
+adata.write('anndata.h5ad', compression='gzip')
+print(adata)
+#end if
+
 ]]></configfile>
     </configfiles>
     <inputs>
@@ -137,9 +170,12 @@
                 <option value="obs_names_make_unique">Makes the obs index unique by appending '1', '2', etc</option>
                 <option value="var_names_make_unique">Makes the var index unique by appending '1', '2', etc</option>
                 <option value="rename_categories">Rename categories of annotation</option>
+                <option value="remove_keys">Remove keys from obs or var annotations</option>
+                <option value="flag_genes">Flag genes that start with a pattern</option><!--adapted from EBI anndata operations tool -->
                 <option value="strings_to_categoricals">Transform string annotations to categoricals</option>
                 <option value="transpose">Transpose the data matrix, leaving observations and variables interchanged</option>
                 <option value="add_annotation">Add new annotation(s) for observations or variables</option>
+                <option value="split_on_obs">Split the AnnData object into multiple AnnData objects based on the values of a given obs key</option><!--adapted from EBI anndata operations tool-->
                 <option value="filter">Filter observations or variables</option>
                 <option value="save_raw">Freeze the current state into the 'raw' attribute</option>
             </param>
@@ -167,6 +203,26 @@
                 <param name="key" type="text" value="" label="Key for observations or variables annotation" help="Annotation key in obs or var"/>
                 <param name="categories" type="text" value="" label="Comma-separated list of new categories" help="It should be the same number as the old categories"/>
             </when>
+            <when value="remove_keys">
+                <param name="obs_keys" type="text" value="" optional="true" label="Keys/fields to remove from observations (obs)">
+                    <expand macro="sanitize_query"/>
+                </param>
+                <param name="var_keys" type="text" value="" optional="true" label="Keys/fields to remove from variables (var)">
+                    <expand macro="sanitize_query"/>
+                </param>
+            </when>
+            <when value="flag_genes">
+                <repeat name="gene_flags" title="Flag genes that start with these names">
+                    <param name="startswith" type="text" label="Text that you expect the genes to be flagged to start with" help="For example, 'MT-' for mito genes">
+                        <sanitizer invalid_char="">
+                            <valid initial="string.ascii_letters,string.digits,string.punctuation">
+                                <remove value="&apos;" />
+                            </valid>
+                        </sanitizer>
+                    </param>
+                    <param name="col_name" type="text" label="Name of the column in var where this boolean flag is stored" help="For example, name this column as 'mito' for mitochondrial genes."/>
+                </repeat>
+            </when>
             <when value="strings_to_categoricals" ></when>
             <when value="transpose" ></when>
             <when value="add_annotation">
@@ -177,6 +233,15 @@
                 <param name="new_annot" type="data" format="tabular" label="Table with new annotations"
                     help="The new table should have the same number of rows and same order than obs or var. The key names should be in the header (1st line)"/>
             </when>
+            <when value="split_on_obs">
+                <param name="key" type="text" label="The obs key to split on" help="For example, if you want to split on cluster annotation, you can use the key 'louvain'. The output will be a collection of anndata objects">
+                    <sanitizer invalid_char="">
+                        <valid initial="string.ascii_letters,string.digits,string.punctuation">
+                            <remove value="&apos;" />
+                        </valid>
+                    </sanitizer>
+                </param>
+            </when>
             <when value="filter">
                 <param name="var_obs" type="select" label="What to filter?">
                     <option value="var">Variables (var)</option>
@@ -237,10 +302,16 @@
         </conditional>
     </inputs>
     <outputs>
-        <data name="anndata" format="h5ad" from_work_dir="anndata.h5ad" label="${tool.name} (${manipulate.function}) on ${on_string}"/>
+        <data name="anndata" format="h5ad" from_work_dir="anndata.h5ad" label="${tool.name} (${manipulate.function}) on ${on_string}">
+            <filter>manipulate['function'] != 'split_on_obs'</filter>
+        </data>
+        <collection name="output_h5ad_split" type="list" label="${tool.name} (${manipulate.function}) on ${on_string} Collection">
+            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.h5ad" directory="output_split" format="h5ad" visible="true"/>
+            <filter>manipulate['function'] == 'split_on_obs'</filter>
+        </collection>
     </outputs>
     <tests>
-        <test>
+        <test expect_num_outputs="1">
             <!-- test 1 -->
             <param name="input" value="import.csv.h5ad"/>
             <conditional name="manipulate">
@@ -256,10 +327,15 @@
                 <has_text_matching expression="join='inner'"/>
                 <has_text_matching expression="index_unique='-'"/>
                 <has_text_matching expression="batch_key='batch'"/>
+                <has_text_matching expression="6 × 2"/>
             </assert_stdout>
-            <output name="anndata" value="manipulate.concatenate.h5ad" ftype="h5ad" compare="sim_size"/>
+            <output name="anndata" ftype="h5ad">
+                <assert_contents>
+                    <has_h5_keys keys="obs/batch"/>
+                </assert_contents>
+            </output>
         </test>
-        <test>
+        <test expect_num_outputs="1">
             <!-- test 2 -->
             <param name="input" value="krumsiek11.h5ad"/>
             <conditional name="manipulate">
@@ -268,10 +344,17 @@
             </conditional>
             <assert_stdout>
                 <has_text_matching expression="adata.obs_names_make_unique\(join='-'\)"/>
+                <has_text_matching expression="500 × 11"/>
             </assert_stdout>
-            <output name="anndata" value="manipulate.obs_names_make_unique.h5ad" ftype="h5ad" compare="sim_size"/>
+            <output name="anndata" ftype="h5ad">
+                <assert_contents>
+                    <has_h5_keys keys="obs/cell_type"/>
+                    <has_h5_keys keys="uns/highlights"/>
+                    <has_h5_keys keys="uns/iroot"/>
+                </assert_contents>
+            </output>
         </test>
-        <test>
+        <test expect_num_outputs="1">
             <!-- test 3 -->
             <param name="input" value="krumsiek11.h5ad"/>
             <conditional name="manipulate">
@@ -280,25 +363,39 @@
             </conditional>
             <assert_stdout>
                 <has_text_matching expression="adata.var_names_make_unique\(join='-'\)"/>
+                <has_text_matching expression="500 × 11"/>
             </assert_stdout>
-            <output name="anndata" value="manipulate.var_names_make_unique.h5ad" ftype="h5ad" compare="sim_size"/>
+            <output name="anndata" ftype="h5ad">
+                <assert_contents>
+                    <has_h5_keys keys="obs/cell_type"/>
+                    <has_h5_keys keys="uns/highlights"/>
+                    <has_h5_keys keys="uns/iroot"/>
+                </assert_contents>
+            </output>
         </test>
-        <test>
+        <test expect_num_outputs="1">
             <!-- test 4 -->
             <param name="input" value="krumsiek11.h5ad"/>
             <conditional name="manipulate">
                 <param name="function" value="rename_categories"/>
                 <param name="key" value="cell_type"/>
-                <param name="categories" value="Ery, Mk, Mo, progenitor"/>
+                <param name="categories" value="ery, mk, mo, progenitor"/>
             </conditional>
             <assert_stdout>
                 <has_text_matching expression="adata.rename_categories"/>
                 <has_text_matching expression="key='cell_type'"/>
-                <has_text_matching expression="categories=\['Ery', 'Mk', 'Mo', 'progenitor'\]"/>
+                <has_text_matching expression="categories=\['ery', 'mk', 'mo', 'progenitor'\]"/>
+                <has_text_matching expression="500 × 11"/>
             </assert_stdout>
-            <output name="anndata" value="manipulate.rename_categories.h5ad" ftype="h5ad" compare="sim_size"/>
+            <output name="anndata" ftype="h5ad">
+                <assert_contents>
+                    <has_h5_keys keys="obs/cell_type"/>
+                    <has_h5_keys keys="uns/highlights"/>
+                    <has_h5_keys keys="uns/iroot"/>
+                </assert_contents>
+            </output>
         </test>
-        <test>
+        <test expect_num_outputs="1">
             <!-- test 5 -->
             <param name="input" value="krumsiek11.h5ad"/>
             <conditional name="manipulate">
@@ -306,10 +403,17 @@
             </conditional>
             <assert_stdout>
                 <has_text_matching expression="adata.strings_to_categoricals"/>
+                <has_text_matching expression="500 × 11"/>
             </assert_stdout>
-            <output name="anndata" value="manipulate.strings_to_categoricals.h5ad" ftype="h5ad" compare="sim_size"/>
+            <output name="anndata" ftype="h5ad">
+                <assert_contents>
+                    <has_h5_keys keys="obs/cell_type"/>
+                    <has_h5_keys keys="uns/highlights"/>
+                    <has_h5_keys keys="uns/iroot"/>
+                </assert_contents>
+            </output>
         </test>
-        <test>
+        <test expect_num_outputs="1">
             <!-- test 6 -->
             <param name="input" value="krumsiek11.h5ad"/>
             <conditional name="manipulate">
@@ -317,10 +421,17 @@
             </conditional>
             <assert_stdout>
                 <has_text_matching expression="adata.transpose"/>
+                <has_text_matching expression="11 × 500"/>
             </assert_stdout>
-            <output name="anndata" value="manipulate.transpose.h5ad" ftype="h5ad" compare="sim_size"/>
+            <output name="anndata" ftype="h5ad">
+                <assert_contents>
+                    <has_h5_keys keys="var/cell_type"/>
+                    <has_h5_keys keys="uns/highlights"/>
+                    <has_h5_keys keys="uns/iroot"/>
+                </assert_contents>
+            </output>
         </test>
-        <test>
+        <test expect_num_outputs="1">
             <!-- test 7 -->
             <param name="input" value="krumsiek11.h5ad"/>
             <conditional name="manipulate">
@@ -328,9 +439,20 @@
                 <param name="var_obs" value="var"/>
                 <param name="new_annot" value="var_add_annotation.tabular"/>
             </conditional>
-            <output name="anndata" value="manipulate.add_annotation_var.h5ad" ftype="h5ad" compare="sim_size"/>
+            <assert_stdout>
+                <has_text_matching expression="500 × 11"/>
+            </assert_stdout>
+            <output name="anndata" ftype="h5ad">
+                <assert_contents>
+                    <has_h5_keys keys="obs/cell_type"/>
+                    <has_h5_keys keys="var/annot1"/>
+                    <has_h5_keys keys="var/annot2"/>
+                    <has_h5_keys keys="uns/highlights"/>
+                    <has_h5_keys keys="uns/iroot"/>
+                </assert_contents>
+            </output>
         </test>
-        <test>
+        <test expect_num_outputs="1">
             <!-- test 8 -->
             <param name="input" value="krumsiek11.h5ad"/>
             <conditional name="manipulate">
@@ -338,9 +460,20 @@
                 <param name="var_obs" value="obs"/>
                 <param name="new_annot" value="obs_add_annotation.tabular"/>
             </conditional>
-            <output name="anndata" value="manipulate.add_annotation_obs.h5ad" ftype="h5ad" compare="sim_size"/>
+            <assert_stdout>
+                <has_text_matching expression="500 × 11"/>
+            </assert_stdout>
+            <output name="anndata" ftype="h5ad">
+                <assert_contents>
+                    <has_h5_keys keys="obs/cell_type"/>
+                    <has_h5_keys keys="obs/annot1"/>
+                    <has_h5_keys keys="obs/annot2"/>
+                    <has_h5_keys keys="uns/highlights"/>
+                    <has_h5_keys keys="uns/iroot"/>
+                </assert_contents>
+            </output>
         </test>
-        <test>
+        <test expect_num_outputs="1">
             <!-- test 9 -->
             <param name="input" value="krumsiek11.h5ad"/>
             <conditional name="manipulate">
@@ -354,9 +487,18 @@
                     </conditional>
                 </conditional>
             </conditional>
-            <output name="anndata" value="manipulate.filter_var_index.h5ad" ftype="h5ad" compare="sim_size"/>
+            <assert_stdout>
+                <has_text_matching expression="500 × 2"/>
+            </assert_stdout>
+            <output name="anndata" ftype="h5ad">
+                <assert_contents>
+                    <has_h5_keys keys="obs/cell_type"/>
+                    <has_h5_keys keys="uns/highlights"/>
+                    <has_h5_keys keys="uns/iroot"/>
+                </assert_contents>
+            </output>
         </test>
-        <test>
+        <test expect_num_outputs="1">
             <!-- test 10 -->
             <param name="input" value="krumsiek11.h5ad"/>
             <conditional name="manipulate">
@@ -372,23 +514,120 @@
                     </conditional>
                 </conditional>
             </conditional>
-            <output name="anndata" value="manipulate.filter_obs_key.h5ad" ftype="h5ad" compare="sim_size"/>
+            <assert_stdout>
+                <has_text_matching expression="260 × 11"/>
+            </assert_stdout>
+            <output name="anndata" ftype="h5ad">
+                <assert_contents>
+                    <has_h5_keys keys="obs/cell_type"/>
+                    <has_h5_keys keys="uns/highlights"/>
+                    <has_h5_keys keys="uns/iroot"/>
+                </assert_contents>
+            </output>
         </test>
-        <test>
+        <test expect_num_outputs="1">
             <!-- test 11 -->
             <param name="input" value="krumsiek11.h5ad"/>
             <conditional name="manipulate">
                 <param name="function" value="save_raw"/>
             </conditional>
-            <output name="anndata" value="manipulate.save_raw.h5ad" ftype="h5ad" compare="sim_size" delta="20000" />
+            <assert_stdout>
+                <has_text_matching expression="500 × 11"/>
+            </assert_stdout>
+            <output name="anndata" ftype="h5ad">
+                <assert_contents>
+                    <has_h5_keys keys="obs/cell_type"/>
+                    <has_h5_keys keys="uns/highlights"/>
+                    <has_h5_keys keys="uns/iroot"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="1">
+            <!-- test 12 remove_keys -->
+            <param name="input" value="krumsiek11.h5ad"/>
+            <conditional name="manipulate">
+                <param name="function" value="remove_keys"/>
+                <param name="obs_keys" value="cell_type"/>
+            </conditional>
+            <assert_stdout>
+                <has_text_matching expression="500 × 11"/>
+            </assert_stdout>
+            <output name="anndata" ftype="h5ad">
+                <assert_contents>
+                    <has_h5_keys keys="uns/highlights"/>
+                    <has_h5_keys keys="uns/iroot"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="1">
+            <!-- test 13 flag_genes -->
+            <param name="input" value="krumsiek11.h5ad"/>
+            <conditional name="manipulate">
+                <param name="function" value="flag_genes"/>
+                <repeat name="gene_flags">
+                    <param name="startswith" value="Gata"/>
+                    <param name="col_name" value="Gata_TF"/>
+                </repeat>
+                <repeat name="gene_flags">
+                    <param name="startswith" value="Gf"/>
+                    <param name="col_name" value="GF"/>
+                </repeat>
+            </conditional>
+            <assert_stdout>
+                <has_text_matching expression="500 × 11"/>
+            </assert_stdout>
+            <output name="anndata" ftype="h5ad">
+                <assert_contents>
+                    <has_h5_keys keys="var/Gata_TF"/>
+                    <has_h5_keys keys="var/GF"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="1">
+            <!-- test 14 split_on_obs -->
+            <param name="input" value="krumsiek11.h5ad"/>
+            <conditional name="manipulate">
+                <param name="function" value="split_on_obs"/>
+                <param name="key" value="cell_type"/>
+            </conditional>
+            <output_collection name="output_h5ad_split" type="list">
+                <element name="cell_type_0">
+                    <assert_contents>
+                        <has_h5_keys keys="obs/cell_type"/>
+                        <has_h5_keys keys="uns/highlights"/>
+                        <has_h5_keys keys="uns/iroot"/>
+                    </assert_contents>
+                </element>
+                <element name="cell_type_1">
+                    <assert_contents>
+                        <has_h5_keys keys="obs/cell_type"/>
+                        <has_h5_keys keys="uns/highlights"/>
+                        <has_h5_keys keys="uns/iroot"/>
+                    </assert_contents>
+                </element>
+                <element name="cell_type_2">
+                    <assert_contents>
+                        <has_h5_keys keys="obs/cell_type"/>
+                        <has_h5_keys keys="uns/highlights"/>
+                        <has_h5_keys keys="uns/iroot"/>
+                    </assert_contents>
+                </element>
+                <element name="cell_type_3">
+                    <assert_contents>
+                        <has_h5_keys keys="obs/cell_type"/>
+                        <has_h5_keys keys="uns/highlights"/>
+                        <has_h5_keys keys="uns/iroot"/>
+                    </assert_contents>
+                </element>
+            </output_collection>
         </test>
     </tests>
     <help><![CDATA[
 **What it does**

-This tool takes a AnnData dataset, manipulates it and returns it.
+This tool takes an AnnData dataset, manipulates it and returns it.

-The possible manipulations are:
+The possible manipulations are:

 - Concatenate along the observations axis (`concatenate method <https://anndata.readthedocs.io/en/latest/generated/anndata.AnnData.concatenate.html>`__)

@@ -408,6 +647,14 @@

     Besides calling `self.obs[key].cat.categories = categories` - similar for `var` - this also renames categories in unstructured annotation that uses the categorical annotation `key`

+- Remove keys from obs or var annotations
+
+    Helps in cleaning up an AnnData object with many annotations. For example, it helps in removing QC metrics calculated during preprocessing or already existing cluster annotations.
+
+- Flag genes that start with a pattern
+
+    Useful for flagging mitochondrial or ribosomal protein genes.
+
 - Transform string annotations to categoricals (`strings_to_categoricals method <https://anndata.readthedocs.io/en/latest/generated/anndata.AnnData.strings_to_categoricals.html>`__)

     Only affects string annotations that lead to less categories than the total number of observations.
@@ -416,7 +663,11 @@

     Data matrix is transposed, observations and variables are interchanged.

-- Add annotation for variables or observations
+- Add annotation for variables or observations
+
+- Split the AnnData object into multiple AnnData objects based on the values of a given obs key
+
+    For example, this helps in splitting an AnnData object based on cluster annotation. This function generates a collection whose number of elements equals the number of categories in the input obs key.

 - Filter data variables or observations, by index or key
Binary file test-data/addloomout1.loom has changed
Binary file test-data/addloomout3.loom has changed
Binary file test-data/converted.loom.test has changed
Binary file test-data/export.krumsiek11.loom has changed
Binary file test-data/import.csv.h5ad has changed
Binary file test-data/import.loom.krumsiek11.h5ad has changed
Binary file test-data/import.mtx.legacy_10x.h5ad has changed
Binary file test-data/import.mtx.no_10x.h5ad has changed
Binary file test-data/import.mtx.v3_10x.h5ad has changed
Binary file test-data/import.tsv.h5ad has changed
Binary file test-data/import.umi_tools.h5ad has changed
Binary file test-data/krumsiek11.h5ad has changed
Binary file test-data/manipulate.add_annotation_obs.h5ad has changed
Binary file test-data/manipulate.add_annotation_var.h5ad has changed
Binary file test-data/manipulate.concatenate.h5ad has changed
Binary file test-data/manipulate.filter_obs_key.h5ad has changed
Binary file test-data/manipulate.filter_var_index.h5ad has changed
Binary file test-data/manipulate.obs_names_make_unique.h5ad has changed
Binary file test-data/manipulate.rename_categories.h5ad has changed
Binary file test-data/manipulate.save_raw.h5ad has changed
Binary file test-data/manipulate.strings_to_categoricals.h5ad has changed
Binary file test-data/manipulate.transpose.h5ad has changed
Binary file test-data/manipulate.var_names_make_unique.h5ad has changed
Binary file test-data/pp.neighbors_umap_euclidean.recipe_weinreb17.paul15_subsample.h5ad has changed
Binary file test-data/pp.pca.krumsiek11.h5ad has changed
Binary file test-data/tl.diffmap.h5ad has changed
Binary file test-data/tl.draw_graph.h5ad has changed
Binary file test-data/tl.paga.neighbors_gauss_braycurtis.recipe_weinreb17.paul15_subsample.h5ad has changed
Binary file test-data/tl.rank_genes_groups.krumsiek11.h5ad has changed
Binary file test-data/tl.tsne.h5ad has changed
Binary file test-data/tl.umap.h5ad has changed