# HG changeset patch # User ebi-gxa # Date 1573121530 18000 # Node ID fe0fd27aba50c3caeb02d9d4683652adab8a08e3 planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5 diff -r 000000000000 -r fe0fd27aba50 salmonKallistoMtxTo10x.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/salmonKallistoMtxTo10x.py Thu Nov 07 05:12:10 2019 -0500 @@ -0,0 +1,65 @@ +#!/usr/bin/env python + +# Alevin and Kallisto currently output MTX files and gene labels in a manner +# inconsistent with the old-style 10X conventions. In both cases the matrix +# must be transposed, and gene indentifier columns duplicated + +from __future__ import print_function +from collections import defaultdict +from struct import Struct +import pandas as pd +import gzip +import sys +import os +from scipy.io import mmread,mmwrite +from scipy.sparse import * +from shutil import copyfile +import pathlib +import numpy as np +import argparse + +parser = argparse.ArgumentParser(description='Convert Alevin or Kallisto MTX outputs to 10X .mtx.') +parser.add_argument('mtx', help = 'MTX-format matrix file') +parser.add_argument('genes', help = 'Gene names text file') +parser.add_argument('barcodes', help = 'Barcodes file') +parser.add_argument('mtx_out', help = 'Output directory for converted results') +parser.add_argument('--cell_prefix', dest='cell_prefix', default='', help = 'Prefix to apply to cell barcodes') +args = parser.parse_args() + +quant_file=args.mtx +cb_file=args.barcodes +gene_file=args.genes +mtx_out=args.mtx_out +cell_prefix=args.cell_prefix + +if not os.path.exists(quant_file): + print("quant file {} doesn't exist".format( quant_file )) + sys.exit(1) + +if not os.path.exists(cb_file): + print("cell barcodes file: {} doesn't exist".format( cb_file )) + sys.exit(1) + +if not os.path.exists(gene_file): + print("genes file: {} doesn't exist".format( gene_file)) + sys.exit(1) + +# Read gene and cell labels, apply cell prefix + +cb_names = [cell_prefix + s for s in pd.read_csv(cb_file, header=None)[0].values] +gene_names = pd.read_csv(gene_file, header=None)[0].values +umi_counts = mmread( quant_file ) + +# Write outputs to a .mtx file readable by tools expecting 10X outputs. +# Barcodes file works as-is, genes need to be two-column, duplicating the +# identifiers. Matrix itself needs to have genes by row, so we transpose. + +pathlib.Path(mtx_out).mkdir(parents=True, exist_ok=True) +mmwrite('%s/matrix.mtx' % mtx_out, umi_counts.transpose()) + +genes_frame = pd.DataFrame([ gene_names, gene_names]).transpose() +genes_frame.to_csv(path_or_buf='%s/genes.tsv' % mtx_out, index=False, sep="\t", header = False) + +with open('%s/barcodes.tsv' % mtx_out, 'w') as f: + f.write("\n".join(cb_names)) + f.write("\n") diff -r 000000000000 -r fe0fd27aba50 salmonKallistoMtxTo10x.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/salmonKallistoMtxTo10x.xml Thu Nov 07 05:12:10 2019 -0500 @@ -0,0 +1,62 @@ + + Transforms .mtx matrix and associated labels into a format compatible with tools expecting old-style 10X data + + scipy + pandas + + + + + + + + + + + + + + + + + + + +@misc{github-hinxton-single-cell, +author = {Jonathan Manning, EBI Gene Expression Team}, +year = {2019}, +title = {Hinxton Single Cell Anlysis Environment}, +publisher = {GitHub}, +journal = {GitHub repository}, +url = {https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary}, + } + +