Mercurial > repos > ebi-gxa > salmon_kallisto_mtx_to_10x
comparison salmonKallistoMtxTo10x.py @ 0:fe0fd27aba50 draft
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
author | ebi-gxa |
---|---|
date | Thu, 07 Nov 2019 05:12:10 -0500 |
parents | |
children | 60fa6080f86f |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:fe0fd27aba50 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 # Alevin and Kallisto currently output MTX files and gene labels in a manner | |
4 # inconsistent with the old-style 10X conventions. In both cases the matrix | |
5 # must be transposed, and gene indentifier columns duplicated | |
6 | |
7 from __future__ import print_function | |
8 from collections import defaultdict | |
9 from struct import Struct | |
10 import pandas as pd | |
11 import gzip | |
12 import sys | |
13 import os | |
14 from scipy.io import mmread,mmwrite | |
15 from scipy.sparse import * | |
16 from shutil import copyfile | |
17 import pathlib | |
18 import numpy as np | |
19 import argparse | |
20 | |
21 parser = argparse.ArgumentParser(description='Convert Alevin or Kallisto MTX outputs to 10X .mtx.') | |
22 parser.add_argument('mtx', help = 'MTX-format matrix file') | |
23 parser.add_argument('genes', help = 'Gene names text file') | |
24 parser.add_argument('barcodes', help = 'Barcodes file') | |
25 parser.add_argument('mtx_out', help = 'Output directory for converted results') | |
26 parser.add_argument('--cell_prefix', dest='cell_prefix', default='', help = 'Prefix to apply to cell barcodes') | |
27 args = parser.parse_args() | |
28 | |
29 quant_file=args.mtx | |
30 cb_file=args.barcodes | |
31 gene_file=args.genes | |
32 mtx_out=args.mtx_out | |
33 cell_prefix=args.cell_prefix | |
34 | |
35 if not os.path.exists(quant_file): | |
36 print("quant file {} doesn't exist".format( quant_file )) | |
37 sys.exit(1) | |
38 | |
39 if not os.path.exists(cb_file): | |
40 print("cell barcodes file: {} doesn't exist".format( cb_file )) | |
41 sys.exit(1) | |
42 | |
43 if not os.path.exists(gene_file): | |
44 print("genes file: {} doesn't exist".format( gene_file)) | |
45 sys.exit(1) | |
46 | |
47 # Read gene and cell labels, apply cell prefix | |
48 | |
49 cb_names = [cell_prefix + s for s in pd.read_csv(cb_file, header=None)[0].values] | |
50 gene_names = pd.read_csv(gene_file, header=None)[0].values | |
51 umi_counts = mmread( quant_file ) | |
52 | |
53 # Write outputs to a .mtx file readable by tools expecting 10X outputs. | |
54 # Barcodes file works as-is, genes need to be two-column, duplicating the | |
55 # identifiers. Matrix itself needs to have genes by row, so we transpose. | |
56 | |
57 pathlib.Path(mtx_out).mkdir(parents=True, exist_ok=True) | |
58 mmwrite('%s/matrix.mtx' % mtx_out, umi_counts.transpose()) | |
59 | |
60 genes_frame = pd.DataFrame([ gene_names, gene_names]).transpose() | |
61 genes_frame.to_csv(path_or_buf='%s/genes.tsv' % mtx_out, index=False, sep="\t", header = False) | |
62 | |
63 with open('%s/barcodes.tsv' % mtx_out, 'w') as f: | |
64 f.write("\n".join(cb_names)) | |
65 f.write("\n") |