Repository 'shm_csr'
hg clone https://toolshed.g2.bx.psu.edu/repos/davidvanzessen/shm_csr

Changeset 79:98e3fecedd2b (2020-09-01)
Previous changeset 78:aff3ba86ef7a (2020-08-31) Next changeset 80:a4617f1d1d89 (2021-02-19)
Commit message:
Uploaded
modified:
change_o/define_clones.sh
change_o/makedb.sh
removed:
change_o/DefineClones.py
change_o/MakeDb.py
b
diff -r aff3ba86ef7a -r 98e3fecedd2b change_o/DefineClones.py
--- a/change_o/DefineClones.py Mon Aug 31 11:20:08 2020 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,739 +0,0 @@\n-#!/usr/bin/env python3\n-"""\n-Assign Ig sequences into clones\n-"""\n-\n-# Info\n-__author__ = \'Namita Gupta, Jason Anthony Vander Heiden, Gur Yaari, Mohamed Uduman\'\n-from changeo import __version__, __date__\n-\n-# Imports\n-import os\n-import re\n-import sys\n-from argparse import ArgumentParser\n-from collections import OrderedDict\n-from itertools import chain\n-from textwrap import dedent\n-from time import time\n-from Bio.Seq import translate\n-\n-# Presto and changeo imports\n-from presto.Defaults import default_out_args\n-from presto.IO import printLog, printProgress, printCount, printWarning, printError\n-from presto.Multiprocessing import manageProcesses\n-from changeo.Defaults import default_format, default_v_field, default_j_field, default_junction_field\n-from changeo.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs\n-from changeo.Distance import distance_models, calcDistances, formClusters\n-from changeo.IO import countDbFile, getDbFields, getFormatOperators, getOutputHandle, \\\n-                       AIRRWriter, ChangeoWriter\n-from changeo.Multiprocessing import DbResult, feedDbQueue, processDbQueue\n-\n-# Defaults\n-default_translate = False\n-default_distance = 0.0\n-default_index_mode = \'gene\'\n-default_index_action = \'set\'\n-default_distance_model = \'ham\'\n-default_norm = \'len\'\n-default_sym = \'avg\'\n-default_linkage = \'single\'\n-default_max_missing=0\n-choices_distance_model = (\'ham\', \'aa\', \'hh_s1f\', \'hh_s5f\',\n-                          \'mk_rs1nf\', \'mk_rs5nf\',\n-                          \'hs1f_compat\', \'m1n_compat\')\n-\n-\n-def filterMissing(data, seq_field=default_junction_field, v_field=default_v_field,\n-                  j_field=default_j_field, max_missing=default_max_missing):\n-    """\n-    Splits a set of sequence into passed and failed groups based on the number\n-    of missing characters in the sequence\n-\n-    Arguments:\n-        data : changeo.Multiprocessing.DbData object.\n-        seq_field : sequence field to filter on.\n-        v_field : field containing the V call.\n-        j_field : field containing the J call.\n-        max_missing : maximum number of missing characters (non-ACGT) to permit before failing the record.\n-\n-    Returns:\n-        changeo.Multiprocessing.DbResult : objected containing filtered records.\n-    """\n-    # Function to validate the sequence string\n-    def _pass(seq):\n-        if len(seq) > 0 and len(re.findall(r\'[^ACGT]\', seq)) <= max_missing:\n-            return True\n-        else:\n-            return False\n-\n-    # Define result object for iteration and get data records\n-    result = DbResult(data.id, data.data)\n-\n-    if not data:\n-        result.data_pass = []\n-        result.data_fail = data.data\n-        return result\n-\n-    result.data_pass = []\n-    result.data_fail = []\n-    for rec in data.data:\n-        seq = rec.getField(seq_field)\n-        if _pass(seq):  result.data_pass.append(rec)\n-        else:  result.data_fail.append(rec)\n-\n-    # Add V(D)J to log\n-    result.log[\'ID\'] = \',\'.join([str(x) for x in data.id])\n-    result.log[\'VCALL\'] = \',\'.join(set([(r.getVAllele(field=v_field) or \'\') for r in data.data]))\n-    result.log[\'JCALL\'] = \',\'.join(set([(r.getJAllele(field=j_field) or \'\') for r in data.data]))\n-    result.log[\'JUNCLEN\'] = \',\'.join(set([(str(len(r.junction)) or \'0\') for r in data.data]))\n-    result.log[\'CLONED\'] = len(result.data_pass)\n-    result.log[\'FILTERED\'] = len(result.data_fail)\n-\n-    return result\n-\n-\n-def indexByIdentity(index, key, rec, group_fields=None):\n-    """\n-    Updates a preclone index with a simple key\n-\n-    Arguments:\n-      index : preclone index from groupByGene\n-      key : index key\n-      rec : Receptor to add to the index\n-      group_fields : additional annotation fields to use to group preclones;\n-                     if None use only V, J and junction length\n-\n-    Returns:\n-      None : Updates index with new key and records.\n-    """\n-    index.setdefault(tuple(key), []).append(rec)\n-\n-\n-def i'..b'.add_argument(\'--norm\', action=\'store\', dest=\'norm\',\n-                        choices=(\'len\', \'mut\', \'none\'), default=default_norm,\n-                        help=\'\'\'Specifies how to normalize distances. One of none\n-                             (do not normalize), len (normalize by length),\n-                             or mut (normalize by number of mutations between sequences).\'\'\')\n-    group.add_argument(\'--sym\', action=\'store\', dest=\'sym\',\n-                        choices=(\'avg\', \'min\'), default=default_sym,\n-                        help=\'\'\'Specifies how to combine asymmetric distances. One of avg\n-                             (average of A->B and B->A) or min (minimum of A->B and B->A).\'\'\')\n-    group.add_argument(\'--link\', action=\'store\', dest=\'linkage\',\n-                        choices=(\'single\', \'average\', \'complete\'), default=default_linkage,\n-                        help=\'\'\'Type of linkage to use for hierarchical clustering.\'\'\')\n-    group.add_argument(\'--maxmiss\', action=\'store\', dest=\'max_missing\', type=int,\n-                        default=default_max_missing,\n-                        help=\'\'\'The maximum number of non-ACGT characters (gaps or Ns) to \n-                             permit in the junction sequence before excluding the record \n-                             from clonal assignment. Note, under single linkage \n-                             non-informative positions can create artifactual links \n-                             between unrelated sequences. Use with caution.\'\'\')\n-    parser.set_defaults(group_func=groupByGene)\n-    parser.set_defaults(clone_func=distanceClones)\n-        \n-    return parser\n-\n-\n-if __name__ == \'__main__\':\n-    """\n-    Parses command line arguments and calls main function\n-    """\n-    # Parse arguments\n-    parser = getArgParser()\n-    checkArgs(parser)\n-    args = parser.parse_args()\n-    args_dict = parseCommonArgs(args)\n-\n-    # # Set default fields if not specified.\n-    # default_fields = {\'seq_field\': default_junction_field,\n-    #                   \'v_field\': default_v_field,\n-    #                   \'j_field\': default_j_field}\n-    #\n-    # # Default Change-O fields\n-    # if args_dict[\'format\'] == \'changeo\':\n-    #     for f in default_fields:\n-    #         if args_dict[f] is None:  args_dict[f] = default_fields[f]\n-    #         else: args_dict[f] = args_dict[f].upper()\n-    #\n-    # # Default AIRR fields\n-    # if args_dict[\'format\'] == \'airr\':\n-    #     for f in default_fields:\n-    #         if args_dict[f] is None:  args_dict[f] = ChangeoSchema.toAIRR(default_fields[f])\n-    #         else: args_dict[f] = args_dict[f].lower()\n-\n-    # Define grouping and cloning function arguments\n-    args_dict[\'group_args\'] = {\'action\': args_dict[\'action\'],\n-                               \'mode\':args_dict[\'mode\']}\n-    args_dict[\'clone_args\'] = {\'model\':  args_dict[\'model\'],\n-                               \'distance\':  args_dict[\'distance\'],\n-                               \'norm\': args_dict[\'norm\'],\n-                               \'sym\': args_dict[\'sym\'],\n-                               \'linkage\': args_dict[\'linkage\']}\n-\n-    # Get distance matrix\n-    try:\n-        args_dict[\'clone_args\'][\'dist_mat\'] = distance_models[args_dict[\'model\']]\n-    except KeyError:\n-        printError(\'Unrecognized distance model: %s\' % args_dict[\'model\'])\n-\n-    # Clean argument dictionary\n-    del args_dict[\'action\']\n-    del args_dict[\'mode\']\n-    del args_dict[\'model\']\n-    del args_dict[\'distance\']\n-    del args_dict[\'norm\']\n-    del args_dict[\'sym\']\n-    del args_dict[\'linkage\']\n-\n-    # Clean arguments dictionary\n-    del args_dict[\'db_files\']\n-    if \'out_files\' in args_dict: del args_dict[\'out_files\']\n-\n-    # Call main function for each input file\n-    for i, f in enumerate(args.__dict__[\'db_files\']):\n-        args_dict[\'db_file\'] = f\n-        args_dict[\'out_file\'] = args.__dict__[\'out_files\'][i] \\\n-            if args.__dict__[\'out_files\'] else None\n-        defineClones(**args_dict)\n'
b
diff -r aff3ba86ef7a -r 98e3fecedd2b change_o/MakeDb.py
--- a/change_o/MakeDb.py Mon Aug 31 11:20:08 2020 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,885 +0,0 @@\n-#!/usr/bin/env python3\n-"""\n-Create tab-delimited database file to store sequence alignment information\n-"""\n-\n-# Info\n-__author__ = \'Namita Gupta, Jason Anthony Vander Heiden\'\n-from changeo import __version__, __date__\n-\n-# Imports\n-import os\n-import re\n-import csv\n-from argparse import ArgumentParser\n-from collections import OrderedDict\n-from textwrap import dedent\n-from time import time\n-from Bio import SeqIO\n-\n-# Presto and changeo imports\n-from presto.Annotation import parseAnnotation\n-from presto.IO import countSeqFile, printLog, printMessage, printProgress, printError, printWarning, readSeqFile\n-from changeo.Defaults import default_format, default_out_args\n-from changeo.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs\n-from changeo.Alignment import RegionDefinition\n-from changeo.Gene import buildGermline\n-from changeo.IO import countDbFile, extractIMGT, readGermlines, getFormatOperators, getOutputHandle, \\\n-                       AIRRWriter, ChangeoWriter, IgBLASTReader, IgBLASTReaderAA, IMGTReader, IHMMuneReader\n-from changeo.Receptor import ChangeoSchema, AIRRSchema\n-\n-# 10X Receptor attributes\n-cellranger_base = [\'cell\', \'c_call\', \'conscount\', \'umicount\']\n-cellranger_extended = [\'cell\', \'c_call\', \'conscount\', \'umicount\',\n-                       \'v_call_10x\', \'d_call_10x\', \'j_call_10x\',\n-                       \'junction_10x\', \'junction_10x_aa\']\n-\n-def readCellRanger(cellranger_file, fields=cellranger_base):\n-    """\n-    Load a Cell Ranger annotation table\n-\n-    Arguments:\n-      cellranger_file (str): path to the annotation file.\n-      fields (list): list of fields to keep.\n-\n-    Returns:\n-      dict: dict of dicts with contig_id as the primary key.\n-    """\n-    # Mapping of 10X annotations to Receptor attributes\n-    cellranger_map = {\'cell\':  \'barcode\',\n-                      \'c_call\': \'c_gene\',\n-                      \'locus\': \'chain\',\n-                      \'conscount\': \'reads\',\n-                      \'umicount\': \'umis\',\n-                      \'v_call_10x\': \'v_gene\',\n-                      \'d_call_10x\': \'d_gene\',\n-                      \'j_call_10x\': \'j_gene\',\n-                      \'junction_10x\': \'cdr3_nt\',\n-                      \'junction_10x_aa\': \'cdr3\'}\n-\n-    # Function to parse individual fields\n-    def _parse(x):\n-        return \'\' if x == \'None\' else x\n-\n-    # Generate annotation dictionary\n-    ann_dict = {}\n-    with open(cellranger_file) as csv_file:\n-        # Detect delimiters\n-        dialect = csv.Sniffer().sniff(csv_file.readline())\n-        csv_file.seek(0)\n-        # Read in annotation file\n-        csv_reader = csv.DictReader(csv_file, dialect=dialect)\n-\n-        # Generate annotation dictionary\n-        for row in csv_reader:\n-            ann_dict[row[\'contig_id\']] = {f: _parse(row[cellranger_map[f]]) for f in fields}\n-\n-    return ann_dict\n-\n-\n-def addGermline(receptor, references, amino_acid=False):\n-    """\n-    Add full length germline to Receptor object\n-\n-    Arguments:\n-      receptor (changeo.Receptor.Receptor): Receptor object to modify.\n-      references (dict): dictionary of IMGT-gapped references sequences.\n-      amino_acid (bool): if True build amino acid germline, otherwise build nucleotide germline\n-\n-    Returns:\n-      changeo.Receptor.Receptor: modified Receptor with the germline sequence added.\n-    """\n-    if amino_acid:\n-        __, germlines, __ = buildGermline(receptor, references, seq_field=\'sequence_aa_imgt\',\n-                                          amino_acid=True)\n-        germline_seq = None if germlines is None else germlines[\'full\']\n-        receptor.setField(\'germline_aa_imgt\', germline_seq)\n-    else:\n-        __, germlines, __ = buildGermline(receptor, references, amino_acid=False)\n-        germline_seq = None if germlines is None else germlines[\'full\']\n-        receptor.setField(\'germline_imgt\', germline_seq)\n-\n-    return receptor\n-\n-\n-def getIDforIMGT(seq_file):\n-    """\n-    Create a sequenc'..b'         required=True,\n-                             help=\'\'\'iHMMune-Align output file.\'\'\')\n-    group_ihmm.add_argument(\'-r\', nargs=\'+\', action=\'store\', dest=\'repo\', required=True,\n-                             help=\'\'\'List of folders and/or FASTA files containing\n-                                   the set of germline sequences used by iHMMune-Align. These\n-                                   reference sequences must contain IMGT-numbering spacers (gaps)\n-                                   in the V segment.\'\'\')\n-    group_ihmm.add_argument(\'-s\', action=\'store\', nargs=\'+\', dest=\'seq_files\',\n-                             required=True,\n-                             help=\'\'\'List of input FASTA files (with .fasta, .fna or .fa\n-                                  extension) containing sequences.\'\'\')\n-    group_ihmm.add_argument(\'--10x\', action=\'store\', nargs=\'+\', dest=\'cellranger_file\',\n-                                help=\'\'\'Table file containing 10X annotations (with .csv or .tsv\n-                                     extension).\'\'\')\n-    group_ihmm.add_argument(\'--asis-id\', action=\'store_true\', dest=\'asis_id\',\n-                             help=\'\'\'Specify to prevent input sequence headers from being parsed\n-                                  to add new columns to database. Parsing of sequence headers requires\n-                                  headers to be in the pRESTO annotation format, so this should be specified\n-                                  when sequence headers are incompatible with the pRESTO annotation scheme.\n-                                  Note, unrecognized header formats will default to this behavior.\'\'\')\n-    group_ihmm.add_argument(\'--partial\', action=\'store_true\', dest=\'partial\',\n-                             help=\'\'\'If specified, include incomplete V(D)J alignments in\n-                                  the pass file instead of the fail file. An incomplete alignment\n-                                     is defined as a record for which a valid IMGT-gapped sequence \n-                                     cannot be built or that is missing a V gene assignment, \n-                                     J gene assignment, junction region, or productivity call.\'\'\')\n-    group_ihmm.add_argument(\'--extended\', action=\'store_true\', dest=\'extended\',\n-                             help=\'\'\'Specify to include additional aligner specific fields in the output. \n-                                  Adds the path score of the iHMMune-Align hidden Markov model as vdj_score;\n-                                  adds fwr1, fwr2, fwr3, fwr4, cdr1, cdr2 and cdr3.\'\'\')\n-    parser_ihmm.set_defaults(func=parseIHMM)\n-\n-    return parser\n-    \n-    \n-if __name__ == "__main__":\n-    """\n-    Parses command line arguments and calls main\n-    """\n-    parser = getArgParser()\n-    checkArgs(parser)\n-    args = parser.parse_args()\n-    args_dict = parseCommonArgs(args, in_arg=\'aligner_files\')\n-\n-    # Set no ID parsing if sequence files are not provided\n-    if \'seq_files\' in args_dict and not args_dict[\'seq_files\']:\n-        args_dict[\'asis_id\'] = True\n-\n-    # Delete\n-    if \'aligner_files\' in args_dict: del args_dict[\'aligner_files\']\n-    if \'seq_files\' in args_dict: del args_dict[\'seq_files\']\n-    if \'out_files\' in args_dict: del args_dict[\'out_files\']\n-    if \'command\' in args_dict: del args_dict[\'command\']\n-    if \'func\' in args_dict: del args_dict[\'func\']           \n-\n-    # Call main\n-    for i, f in enumerate(args.__dict__[\'aligner_files\']):\n-        args_dict[\'aligner_file\'] = f\n-        args_dict[\'seq_file\'] = args.__dict__[\'seq_files\'][i] \\\n-                                if args.__dict__[\'seq_files\'] else None\n-        args_dict[\'out_file\'] = args.__dict__[\'out_files\'][i] \\\n-                                if args.__dict__[\'out_files\'] else None\n-        args_dict[\'cellranger_file\'] = args.__dict__[\'cellranger_file\'][i] \\\n-                                if args.__dict__[\'cellranger_file\'] else None\n-        args.func(**args_dict)\n'
b
diff -r aff3ba86ef7a -r 98e3fecedd2b change_o/define_clones.sh
--- a/change_o/define_clones.sh Mon Aug 31 11:20:08 2020 -0400
+++ b/change_o/define_clones.sh Tue Sep 01 16:03:44 2020 -0400
b
@@ -21,7 +21,7 @@
  output=${10}
  output2=${11}
 
- python3 $dir/DefineClones.py -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --mode $mode --act $act --model $model --dist $dist --norm $norm --sym $sym --link $link
+ DefineClones.py -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --mode $mode --act $act --model $model --dist $dist --norm $norm --sym $sym --link $link
 
  Rscript $dir/define_clones.r $PWD/outdir/output_clone-pass.tab $output2 2>&1
 else
@@ -29,7 +29,7 @@
  output=$4
  output2=$5
 
- python3 $dir/DefineClones.py hclust -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --method $method
+ DefineClones.py hclust -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --method $method
 
  Rscript $dir/define_clones.r $PWD/outdir/output_clone-pass.tab $output2 2>&1
 fi
b
diff -r aff3ba86ef7a -r 98e3fecedd2b change_o/makedb.sh
--- a/change_o/makedb.sh Mon Aug 31 11:20:08 2020 -0400
+++ b/change_o/makedb.sh Tue Sep 01 16:03:44 2020 -0400
b
@@ -29,7 +29,7 @@
 
 echo "makedb: $PWD/outdir"
 
-python3 $dir/MakeDb.py imgt -i $input --outdir $PWD/outdir --outname output $noparse $scores $regions
+MakeDb.py imgt -i $input --outdir $PWD/outdir --outname output $noparse $scores $regions
 
 mv $PWD/outdir/output_db-pass.tab $output