changeset 0:fe0fd27aba50 draft

planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
author ebi-gxa
date Thu, 07 Nov 2019 05:12:10 -0500
parents
children a2aaefceb418
files salmonKallistoMtxTo10x.py salmonKallistoMtxTo10x.xml
diffstat 2 files changed, 127 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/salmonKallistoMtxTo10x.py	Thu Nov 07 05:12:10 2019 -0500
@@ -0,0 +1,65 @@
+#!/usr/bin/env python
+
+# Alevin and Kallisto currently output MTX files and gene labels in a manner
+# inconsistent with the old-style 10X conventions. In both cases the matrix
+# must be transposed, and the gene identifier column duplicated.
+
+from __future__ import print_function
+from collections import defaultdict
+from struct import Struct
+import pandas as pd
+import gzip
+import sys
+import os
+from scipy.io import mmread,mmwrite
+from scipy.sparse import *
+from shutil import copyfile
+import pathlib
+import numpy as np
+import argparse
+
+parser = argparse.ArgumentParser(description='Convert Alevin or Kallisto MTX outputs to 10X .mtx.')
+parser.add_argument('mtx', help = 'MTX-format matrix file')
+parser.add_argument('genes', help = 'Gene names text file')
+parser.add_argument('barcodes', help = 'Barcodes file')
+parser.add_argument('mtx_out', help = 'Output directory for converted results')
+parser.add_argument('--cell_prefix', dest='cell_prefix', default='', help = 'Prefix to apply to cell barcodes')
+args = parser.parse_args() 
+
+quant_file=args.mtx
+cb_file=args.barcodes
+gene_file=args.genes
+mtx_out=args.mtx_out
+cell_prefix=args.cell_prefix
+
+if not os.path.exists(quant_file):
+    print("quant file {} doesn't exist".format( quant_file ))
+    sys.exit(1)
+
+if not os.path.exists(cb_file):
+    print("cell barcodes file: {} doesn't exist".format( cb_file ))
+    sys.exit(1)
+
+if not os.path.exists(gene_file):
+    print("genes file: {} doesn't exist".format( gene_file))
+    sys.exit(1)
+
+# Read gene and cell labels, apply cell prefix
+
+cb_names = [cell_prefix + s for s in pd.read_csv(cb_file, header=None)[0].values]
+gene_names = pd.read_csv(gene_file, header=None)[0].values
+umi_counts = mmread( quant_file )
+    
+# Write outputs to a .mtx file readable by tools expecting 10X outputs.
+# Barcodes file works as-is, genes need to be two-column, duplicating the
+# identifiers. Matrix itself needs to have genes by row, so we transpose. 
+
+pathlib.Path(mtx_out).mkdir(parents=True, exist_ok=True)
+mmwrite('%s/matrix.mtx' % mtx_out, umi_counts.transpose()) 
+
+genes_frame = pd.DataFrame([ gene_names, gene_names]).transpose()
+genes_frame.to_csv(path_or_buf='%s/genes.tsv' % mtx_out, index=False, sep="\t", header = False)
+
+with open('%s/barcodes.tsv' % mtx_out, 'w') as f:
+    f.write("\n".join(cb_names))    
+    f.write("\n")    
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/salmonKallistoMtxTo10x.xml	Thu Nov 07 05:12:10 2019 -0500
@@ -0,0 +1,62 @@
+<tool id="_salmon_kallisto_mtx_to_10x" name="salmonKallistoMtxTo10x" version="0.0.1+galaxy0">
+    <description>Transforms .mtx matrix and associated labels into a format compatible with tools expecting old-style 10X data</description>
+    <requirements>
+      <requirement type="package">scipy</requirement>
+      <requirement type="package">pandas</requirement>
+    </requirements>
+    <command interpreter="python" detect_errors="exit_code"><![CDATA[
+        $__tool_directory__/salmonKallistoMtxTo10x.py --cell_prefix "${cell_prefix}" "${mtx_file}" "${genes_file}" "${barcodes_file}" ./
+	    ]]></command>
+
+    <inputs>
+        <param name="mtx_file" type="data" format="txt" label=".mtx-format matrix" />
+        <param name="genes_file" type="data" format="txt" label="Tab-delimited genes file" />
+        <param name="barcodes_file" type="data" format="txt" label="Tab-delimited barcodes file" />
+        <param name="cell_prefix" type="text" optional='true' value="" label="Prefix to prepend to cell names / barcodes" help="This is useful when multiple matrices from different libraries with overlapping barcodes will be merged"/>
+    </inputs>
+
+    <outputs>
+        <data name="genes_out" format="txt" from_work_dir="genes.tsv" label="${tool.name} on ${on_string}: genes"/>
+        <data name="barcodes_out" format="txt" from_work_dir="barcodes.tsv" label="${tool.name} on ${on_string}: barcodes"/>
+        <data name="matrix_out" format="txt" from_work_dir="matrix.mtx" label="${tool.name} on ${on_string}: matrix" />
+    </outputs>
+    
+    <help><![CDATA[
+.. class:: infomark
+
+**What it does**
+
+Kallisto and Alevin (and possibly other tools) output an MTX file and associated labels that are not consistent with the old-style 10X conventions, meaning that routines designed to parse old-style 10X files cannot be used on them. This tool transposes the matrix (so that genes are in rows) and reformats the genes file (essentially duplicating the identifier column) to match those earlier conventions.
+
+**Inputs**
+
+The MTX matrix, plus the gene and barcode label files, from the relevant tool. For Alevin this will be:
+
+ * Matrix file: quants_mat.mtx.gz 
+ * Genes file: quants_mat_cols.txt
+ * Barcodes file: quants_mat_rows.txt
+
+For Kallisto it will be:
+
+ * Matrix file: [name].mtx.gz 
+ * Genes file: [name].genes.txt
+ * Barcodes file: [name].barcodes.txt
+
+-----
+
+**Outputs**
+
+ * MTX, gene and cell labels in 10X style
+]]></help>
+<citations>
+  <citation type="bibtex">
+@misc{github-hinxton-single-cell,
+author = {Jonathan Manning, EBI Gene Expression Team},
+year = {2019},
+title = {Hinxton Single Cell Analysis Environment},
+publisher = {GitHub},
+journal = {GitHub repository},
+url = {https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary},
+  }</citation>
+</citations>
+</tool>