diff filter.xml @ 15:aa0059118fb9 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/scanpy/ commit c21958f44b81d740191999fb6015d5ae69538ee0
author iuc
date Wed, 31 Jul 2024 18:10:52 +0000
parents d636ce5cde16
children
line wrap: on
line diff
--- a/filter.xml	Sat May 18 18:29:27 2024 +0000
+++ b/filter.xml	Wed Jul 31 18:10:52 2024 +0000
@@ -98,6 +98,41 @@
     random_state=$method.random_state,
     replace=$method.replace,
     copy=False)
+
+#else if $method.method == "filter_marker"
+
+#if $method.layer_selection.use_raw == 'False':
+adata.X = adata.layers['$method.layer_selection.layer']  # kept at column 0: Cheetah copies this line verbatim into the script, where an indented top-level statement is an IndentationError
+#end if
+
+def check_marker(adata, group, gene, thresh_mean, thresh_frac, groupby):
+    filtered_data = adata[adata.obs[groupby] == group, adata.var_names == gene]
+    mean_expression = np.mean(filtered_data.X)
+    frac_cell_mean_expression = len(filtered_data.X[filtered_data.X > mean_expression]) / filtered_data.n_obs
+    if ( mean_expression > thresh_mean and frac_cell_mean_expression >= thresh_frac ):
+        return(True)
+    return(False)
+
+header='infer'
+
+#if $method.header == 'not_included':
+header=None  # kept at column 0: Cheetah copies this line verbatim into the script, where an indented top-level statement is an IndentationError
+#end if
+
+# Read the marker table: the first column holds the group, the remaining columns hold its candidate markers
+marker_list={key: list(value.values()) for key, value in pd.read_csv('$method.markerfile', sep='\t', index_col=0, header=header).to_dict(orient='index').items()}
+
+# Keep only the markers that pass both thresholds within their own group
+for key, value in marker_list.items():
+    marker_list[key] = [x for x in value if check_marker(adata, key, x, $method.thresh_mean, $method.thresh_frac, '$method.groupby')]
+
+# Find the maximum length of lists (0 when the marker file has no rows, instead of max() raising ValueError)
+max_len = max((len(lst) for lst in marker_list.values()), default=0)
+# Fill smaller lists with empty values so that every group row has the same number of columns
+for key, value in marker_list.items():
+    marker_list[key] = value + [''] * (max_len - len(value))
+df = pd.DataFrame(marker_list).T  # transpose so that each group becomes one row
+df.to_csv('marker.tsv', sep='\t', index=True)
 #end if
 
 @CMD_anndata_write_outputs@
@@ -113,6 +148,7 @@
                 <option value="pp.highly_variable_genes">Annotate (and filter) highly variable genes, using 'pp.highly_variable_genes'</option>
                 <option value="pp.subsample">Subsample to a fraction of the number of observations, using 'pp.subsample'</option>
                 <option value="pp.downsample_counts">Downsample counts from count matrix, using 'pp.downsample_counts'</option>
+                <option value="filter_marker">Filter markers from count matrix and marker list</option>
             </param>
             <when value="pp.filter_cells">
                 <conditional name="filter">
@@ -213,11 +249,36 @@
                 <param argument="random_state" type="integer" value="0" label="Random seed to change subsampling"/>
                 <param argument="replace" type="boolean" truevalue="True" falsevalue="False" checked="false" label="Sample the counts with replacement?"/>
             </when>
+            <when value="filter_marker">
+                <param argument="markerfile" type="data" format="tabular" label="List of markers" help="This should be a TSV file in which each row is a group (e.g., a cell type) and the columns hold the markers of that group."></param>
+                <param name="header" type="select" label="Header in the list of markers?">
+                    <option value="included">Header included</option>
+                    <option value="not_included">Header not included</option>
+                </param>
+                <param argument="thresh_mean" type="float" min="0.0" value="1.0" label="Minimal average count of all cells of a group (e.g., celltype) for a particular marker" help="Increasing the threshold will result in a smaller marker set."/>
+                <param argument="thresh_frac" type="float" min="0.0" max="1.0" value="0.1" label="Minimal fraction of cells that have a higher count than the average count of all cells of the group for the marker" help="Increasing this threshold might remove marker outliers."/>
+                <conditional name="layer_selection">
+                    <param name="use_raw" type="select" label="Use .X of adata to perform the filtering" help="">
+                        <option value="True">Yes</option>
+                        <option value="False">No</option>
+                    </param>
+                    <when value="False">
+                        <param argument="layer" type="text" value="" label="Key from adata.layers whose value will be used to filter" help="If layers specified then use adata.layers[layer]."/>
+                    </when>
+                    <when value="True"/>
+                </conditional>
+                <param argument="groupby" type="text" value="" label="The key of the observation grouping to consider (e.g., celltype)" help="">
+                    <expand macro="sanitize_query" />
+                </param>
+            </when>
         </conditional>
         <expand macro="inputs_common_advanced"/>
     </inputs>
     <outputs>
         <expand macro="anndata_outputs"/>
+        <data name="marker_out" format="tabular" from_work_dir="marker.tsv" label="${tool.name} on ${on_string}: Markers">
+            <filter>method['method'] == 'filter_marker'</filter>
+        </data>
     </outputs>
     <tests>
         <test expect_num_outputs="2">
@@ -444,6 +505,32 @@
             </output>
             <output name="anndata_out" file="pp.downsample_counts.random-randint.h5ad" ftype="h5ad" compare="sim_size" delta="10000000" delta_frac="0.5"/>
         </test>
+        <test expect_num_outputs="3">
+            <!-- test 10 -->
+            <param name="adata" value="cosg.rank_genes_groups.newton-cg.pbmc68k_highly_reduced_1.h5ad" />
+            <conditional name="method">
+                <param name="method" value="filter_marker"/>
+                <param name="markerfile" value="tl.rank_genes_groups.newton-cg.pbmc68k_highly_reduced_marker_1.tsv"/>
+                <param name="thresh_mean" value="1.0"/>
+                <param name="thresh_frac" value="0.2"/>
+                <param name="layer_selection" value="True"/>
+                <param name="groupby" value="bulk_labels"/>
+            </conditional>
+            <section name="advanced_common">
+                <param name="show_log" value="true" />
+            </section>
+            <output name="hidden_output">
+                <assert_contents>
+                    <has_text_matching expression="adata, key, x, 1.0, 0.2, 'bulk_labels'"/>
+                </assert_contents>
+            </output>
+            <output name="anndata_out" file="cosg.rank_genes_groups.newton-cg.pbmc68k_highly_reduced_1.h5ad" ftype="h5ad">
+                <assert_contents>
+                    <has_h5_keys keys="obs, var, uns" />
+                </assert_contents>
+            </output>
+            <output name="marker_out" file="tl.rank_genes_groups.newton-cg.pbmc68k_highly_reduced_marker_filtered_1.tsv" ftype="tabular" compare="sim_size"/>
+        </test>
     </tests>
     <help><![CDATA[
 
@@ -502,6 +589,16 @@
 Downsample counts so that each cell has no more than `target_counts`. Cells with fewer counts than `target_counts` are unaffected by this. This
 has been implemented by M. D. Luecken.
 
+
+Filter marker genes (`filter_marker`)
+======================================================================
+
+This option is specific to celltype marker gene detection. You can generate a celltype marker gene file (tsv) with **COSG**, which is available in Galaxy.
+
+The marker gene file should have celltypes as rows and marker genes as columns. Each celltype can have a varying number of marker genes.
+
+A marker gene is returned (retained in the list) if its mean expression across the cells of the celltype is greater than the mean expression threshold (thresh_mean) and if the fraction of those cells whose expression of the marker gene is higher than that mean is equal to or higher than the cell fraction threshold (thresh_frac).
+
 More details on the `scanpy documentation
 <https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.downsample_counts.html>`__