Previous changeset 78:aff3ba86ef7a (2020-08-31) Next changeset 80:a4617f1d1d89 (2021-02-19) |
Commit message:
Uploaded |
modified:
change_o/define_clones.sh change_o/makedb.sh |
removed:
change_o/DefineClones.py change_o/MakeDb.py |
b |
diff -r aff3ba86ef7a -r 98e3fecedd2b change_o/DefineClones.py --- a/change_o/DefineClones.py Mon Aug 31 11:20:08 2020 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
b'@@ -1,739 +0,0 @@\n-#!/usr/bin/env python3\n-"""\n-Assign Ig sequences into clones\n-"""\n-\n-# Info\n-__author__ = \'Namita Gupta, Jason Anthony Vander Heiden, Gur Yaari, Mohamed Uduman\'\n-from changeo import __version__, __date__\n-\n-# Imports\n-import os\n-import re\n-import sys\n-from argparse import ArgumentParser\n-from collections import OrderedDict\n-from itertools import chain\n-from textwrap import dedent\n-from time import time\n-from Bio.Seq import translate\n-\n-# Presto and changeo imports\n-from presto.Defaults import default_out_args\n-from presto.IO import printLog, printProgress, printCount, printWarning, printError\n-from presto.Multiprocessing import manageProcesses\n-from changeo.Defaults import default_format, default_v_field, default_j_field, default_junction_field\n-from changeo.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs\n-from changeo.Distance import distance_models, calcDistances, formClusters\n-from changeo.IO import countDbFile, getDbFields, getFormatOperators, getOutputHandle, \\\n- AIRRWriter, ChangeoWriter\n-from changeo.Multiprocessing import DbResult, feedDbQueue, processDbQueue\n-\n-# Defaults\n-default_translate = False\n-default_distance = 0.0\n-default_index_mode = \'gene\'\n-default_index_action = \'set\'\n-default_distance_model = \'ham\'\n-default_norm = \'len\'\n-default_sym = \'avg\'\n-default_linkage = \'single\'\n-default_max_missing=0\n-choices_distance_model = (\'ham\', \'aa\', \'hh_s1f\', \'hh_s5f\',\n- \'mk_rs1nf\', \'mk_rs5nf\',\n- \'hs1f_compat\', \'m1n_compat\')\n-\n-\n-def filterMissing(data, seq_field=default_junction_field, v_field=default_v_field,\n- j_field=default_j_field, max_missing=default_max_missing):\n- """\n- Splits a set of sequence into passed and failed groups based on the number\n- of missing characters in the sequence\n-\n- Arguments:\n- data : changeo.Multiprocessing.DbData object.\n- seq_field : sequence field to filter on.\n- v_field : field containing the V call.\n- j_field : field containing the J call.\n- max_missing : maximum number of missing characters (non-ACGT) to permit before failing the record.\n-\n- Returns:\n- changeo.Multiprocessing.DbResult : objected containing filtered records.\n- """\n- # Function to validate the sequence string\n- def _pass(seq):\n- if len(seq) > 0 and len(re.findall(r\'[^ACGT]\', seq)) <= max_missing:\n- return True\n- else:\n- return False\n-\n- # Define result object for iteration and get data records\n- result = DbResult(data.id, data.data)\n-\n- if not data:\n- result.data_pass = []\n- result.data_fail = data.data\n- return result\n-\n- result.data_pass = []\n- result.data_fail = []\n- for rec in data.data:\n- seq = rec.getField(seq_field)\n- if _pass(seq): result.data_pass.append(rec)\n- else: result.data_fail.append(rec)\n-\n- # Add V(D)J to log\n- result.log[\'ID\'] = \',\'.join([str(x) for x in data.id])\n- result.log[\'VCALL\'] = \',\'.join(set([(r.getVAllele(field=v_field) or \'\') for r in data.data]))\n- result.log[\'JCALL\'] = \',\'.join(set([(r.getJAllele(field=j_field) or \'\') for r in data.data]))\n- result.log[\'JUNCLEN\'] = \',\'.join(set([(str(len(r.junction)) or \'0\') for r in data.data]))\n- result.log[\'CLONED\'] = len(result.data_pass)\n- result.log[\'FILTERED\'] = len(result.data_fail)\n-\n- return result\n-\n-\n-def indexByIdentity(index, key, rec, group_fields=None):\n- """\n- Updates a preclone index with a simple key\n-\n- Arguments:\n- index : preclone index from groupByGene\n- key : index key\n- rec : Receptor to add to the index\n- group_fields : additional annotation fields to use to group preclones;\n- if None use only V, J and junction length\n-\n- Returns:\n- None : Updates index with new key and records.\n- """\n- index.setdefault(tuple(key), []).append(rec)\n-\n-\n-def i'..b'.add_argument(\'--norm\', action=\'store\', dest=\'norm\',\n- choices=(\'len\', \'mut\', \'none\'), default=default_norm,\n- help=\'\'\'Specifies how to normalize distances. One of none\n- (do not normalize), len (normalize by length),\n- or mut (normalize by number of mutations between sequences).\'\'\')\n- group.add_argument(\'--sym\', action=\'store\', dest=\'sym\',\n- choices=(\'avg\', \'min\'), default=default_sym,\n- help=\'\'\'Specifies how to combine asymmetric distances. One of avg\n- (average of A->B and B->A) or min (minimum of A->B and B->A).\'\'\')\n- group.add_argument(\'--link\', action=\'store\', dest=\'linkage\',\n- choices=(\'single\', \'average\', \'complete\'), default=default_linkage,\n- help=\'\'\'Type of linkage to use for hierarchical clustering.\'\'\')\n- group.add_argument(\'--maxmiss\', action=\'store\', dest=\'max_missing\', type=int,\n- default=default_max_missing,\n- help=\'\'\'The maximum number of non-ACGT characters (gaps or Ns) to \n- permit in the junction sequence before excluding the record \n- from clonal assignment. Note, under single linkage \n- non-informative positions can create artifactual links \n- between unrelated sequences. Use with caution.\'\'\')\n- parser.set_defaults(group_func=groupByGene)\n- parser.set_defaults(clone_func=distanceClones)\n- \n- return parser\n-\n-\n-if __name__ == \'__main__\':\n- """\n- Parses command line arguments and calls main function\n- """\n- # Parse arguments\n- parser = getArgParser()\n- checkArgs(parser)\n- args = parser.parse_args()\n- args_dict = parseCommonArgs(args)\n-\n- # # Set default fields if not specified.\n- # default_fields = {\'seq_field\': default_junction_field,\n- # \'v_field\': default_v_field,\n- # \'j_field\': default_j_field}\n- #\n- # # Default Change-O fields\n- # if args_dict[\'format\'] == \'changeo\':\n- # for f in default_fields:\n- # if args_dict[f] is None: args_dict[f] = default_fields[f]\n- # else: args_dict[f] = args_dict[f].upper()\n- #\n- # # Default AIRR fields\n- # if args_dict[\'format\'] == \'airr\':\n- # for f in default_fields:\n- # if args_dict[f] is None: args_dict[f] = ChangeoSchema.toAIRR(default_fields[f])\n- # else: args_dict[f] = args_dict[f].lower()\n-\n- # Define grouping and cloning function arguments\n- args_dict[\'group_args\'] = {\'action\': args_dict[\'action\'],\n- \'mode\':args_dict[\'mode\']}\n- args_dict[\'clone_args\'] = {\'model\': args_dict[\'model\'],\n- \'distance\': args_dict[\'distance\'],\n- \'norm\': args_dict[\'norm\'],\n- \'sym\': args_dict[\'sym\'],\n- \'linkage\': args_dict[\'linkage\']}\n-\n- # Get distance matrix\n- try:\n- args_dict[\'clone_args\'][\'dist_mat\'] = distance_models[args_dict[\'model\']]\n- except KeyError:\n- printError(\'Unrecognized distance model: %s\' % args_dict[\'model\'])\n-\n- # Clean argument dictionary\n- del args_dict[\'action\']\n- del args_dict[\'mode\']\n- del args_dict[\'model\']\n- del args_dict[\'distance\']\n- del args_dict[\'norm\']\n- del args_dict[\'sym\']\n- del args_dict[\'linkage\']\n-\n- # Clean arguments dictionary\n- del args_dict[\'db_files\']\n- if \'out_files\' in args_dict: del args_dict[\'out_files\']\n-\n- # Call main function for each input file\n- for i, f in enumerate(args.__dict__[\'db_files\']):\n- args_dict[\'db_file\'] = f\n- args_dict[\'out_file\'] = args.__dict__[\'out_files\'][i] \\\n- if args.__dict__[\'out_files\'] else None\n- defineClones(**args_dict)\n' |
b |
diff -r aff3ba86ef7a -r 98e3fecedd2b change_o/MakeDb.py --- a/change_o/MakeDb.py Mon Aug 31 11:20:08 2020 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
b'@@ -1,885 +0,0 @@\n-#!/usr/bin/env python3\n-"""\n-Create tab-delimited database file to store sequence alignment information\n-"""\n-\n-# Info\n-__author__ = \'Namita Gupta, Jason Anthony Vander Heiden\'\n-from changeo import __version__, __date__\n-\n-# Imports\n-import os\n-import re\n-import csv\n-from argparse import ArgumentParser\n-from collections import OrderedDict\n-from textwrap import dedent\n-from time import time\n-from Bio import SeqIO\n-\n-# Presto and changeo imports\n-from presto.Annotation import parseAnnotation\n-from presto.IO import countSeqFile, printLog, printMessage, printProgress, printError, printWarning, readSeqFile\n-from changeo.Defaults import default_format, default_out_args\n-from changeo.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs\n-from changeo.Alignment import RegionDefinition\n-from changeo.Gene import buildGermline\n-from changeo.IO import countDbFile, extractIMGT, readGermlines, getFormatOperators, getOutputHandle, \\\n- AIRRWriter, ChangeoWriter, IgBLASTReader, IgBLASTReaderAA, IMGTReader, IHMMuneReader\n-from changeo.Receptor import ChangeoSchema, AIRRSchema\n-\n-# 10X Receptor attributes\n-cellranger_base = [\'cell\', \'c_call\', \'conscount\', \'umicount\']\n-cellranger_extended = [\'cell\', \'c_call\', \'conscount\', \'umicount\',\n- \'v_call_10x\', \'d_call_10x\', \'j_call_10x\',\n- \'junction_10x\', \'junction_10x_aa\']\n-\n-def readCellRanger(cellranger_file, fields=cellranger_base):\n- """\n- Load a Cell Ranger annotation table\n-\n- Arguments:\n- cellranger_file (str): path to the annotation file.\n- fields (list): list of fields to keep.\n-\n- Returns:\n- dict: dict of dicts with contig_id as the primary key.\n- """\n- # Mapping of 10X annotations to Receptor attributes\n- cellranger_map = {\'cell\': \'barcode\',\n- \'c_call\': \'c_gene\',\n- \'locus\': \'chain\',\n- \'conscount\': \'reads\',\n- \'umicount\': \'umis\',\n- \'v_call_10x\': \'v_gene\',\n- \'d_call_10x\': \'d_gene\',\n- \'j_call_10x\': \'j_gene\',\n- \'junction_10x\': \'cdr3_nt\',\n- \'junction_10x_aa\': \'cdr3\'}\n-\n- # Function to parse individual fields\n- def _parse(x):\n- return \'\' if x == \'None\' else x\n-\n- # Generate annotation dictionary\n- ann_dict = {}\n- with open(cellranger_file) as csv_file:\n- # Detect delimiters\n- dialect = csv.Sniffer().sniff(csv_file.readline())\n- csv_file.seek(0)\n- # Read in annotation file\n- csv_reader = csv.DictReader(csv_file, dialect=dialect)\n-\n- # Generate annotation dictionary\n- for row in csv_reader:\n- ann_dict[row[\'contig_id\']] = {f: _parse(row[cellranger_map[f]]) for f in fields}\n-\n- return ann_dict\n-\n-\n-def addGermline(receptor, references, amino_acid=False):\n- """\n- Add full length germline to Receptor object\n-\n- Arguments:\n- receptor (changeo.Receptor.Receptor): Receptor object to modify.\n- references (dict): dictionary of IMGT-gapped references sequences.\n- amino_acid (bool): if True build amino acid germline, otherwise build nucleotide germline\n-\n- Returns:\n- changeo.Receptor.Receptor: modified Receptor with the germline sequence added.\n- """\n- if amino_acid:\n- __, germlines, __ = buildGermline(receptor, references, seq_field=\'sequence_aa_imgt\',\n- amino_acid=True)\n- germline_seq = None if germlines is None else germlines[\'full\']\n- receptor.setField(\'germline_aa_imgt\', germline_seq)\n- else:\n- __, germlines, __ = buildGermline(receptor, references, amino_acid=False)\n- germline_seq = None if germlines is None else germlines[\'full\']\n- receptor.setField(\'germline_imgt\', germline_seq)\n-\n- return receptor\n-\n-\n-def getIDforIMGT(seq_file):\n- """\n- Create a sequenc'..b' required=True,\n- help=\'\'\'iHMMune-Align output file.\'\'\')\n- group_ihmm.add_argument(\'-r\', nargs=\'+\', action=\'store\', dest=\'repo\', required=True,\n- help=\'\'\'List of folders and/or FASTA files containing\n- the set of germline sequences used by iHMMune-Align. These\n- reference sequences must contain IMGT-numbering spacers (gaps)\n- in the V segment.\'\'\')\n- group_ihmm.add_argument(\'-s\', action=\'store\', nargs=\'+\', dest=\'seq_files\',\n- required=True,\n- help=\'\'\'List of input FASTA files (with .fasta, .fna or .fa\n- extension) containing sequences.\'\'\')\n- group_ihmm.add_argument(\'--10x\', action=\'store\', nargs=\'+\', dest=\'cellranger_file\',\n- help=\'\'\'Table file containing 10X annotations (with .csv or .tsv\n- extension).\'\'\')\n- group_ihmm.add_argument(\'--asis-id\', action=\'store_true\', dest=\'asis_id\',\n- help=\'\'\'Specify to prevent input sequence headers from being parsed\n- to add new columns to database. Parsing of sequence headers requires\n- headers to be in the pRESTO annotation format, so this should be specified\n- when sequence headers are incompatible with the pRESTO annotation scheme.\n- Note, unrecognized header formats will default to this behavior.\'\'\')\n- group_ihmm.add_argument(\'--partial\', action=\'store_true\', dest=\'partial\',\n- help=\'\'\'If specified, include incomplete V(D)J alignments in\n- the pass file instead of the fail file. An incomplete alignment\n- is defined as a record for which a valid IMGT-gapped sequence \n- cannot be built or that is missing a V gene assignment, \n- J gene assignment, junction region, or productivity call.\'\'\')\n- group_ihmm.add_argument(\'--extended\', action=\'store_true\', dest=\'extended\',\n- help=\'\'\'Specify to include additional aligner specific fields in the output. \n- Adds the path score of the iHMMune-Align hidden Markov model as vdj_score;\n- adds fwr1, fwr2, fwr3, fwr4, cdr1, cdr2 and cdr3.\'\'\')\n- parser_ihmm.set_defaults(func=parseIHMM)\n-\n- return parser\n- \n- \n-if __name__ == "__main__":\n- """\n- Parses command line arguments and calls main\n- """\n- parser = getArgParser()\n- checkArgs(parser)\n- args = parser.parse_args()\n- args_dict = parseCommonArgs(args, in_arg=\'aligner_files\')\n-\n- # Set no ID parsing if sequence files are not provided\n- if \'seq_files\' in args_dict and not args_dict[\'seq_files\']:\n- args_dict[\'asis_id\'] = True\n-\n- # Delete\n- if \'aligner_files\' in args_dict: del args_dict[\'aligner_files\']\n- if \'seq_files\' in args_dict: del args_dict[\'seq_files\']\n- if \'out_files\' in args_dict: del args_dict[\'out_files\']\n- if \'command\' in args_dict: del args_dict[\'command\']\n- if \'func\' in args_dict: del args_dict[\'func\'] \n-\n- # Call main\n- for i, f in enumerate(args.__dict__[\'aligner_files\']):\n- args_dict[\'aligner_file\'] = f\n- args_dict[\'seq_file\'] = args.__dict__[\'seq_files\'][i] \\\n- if args.__dict__[\'seq_files\'] else None\n- args_dict[\'out_file\'] = args.__dict__[\'out_files\'][i] \\\n- if args.__dict__[\'out_files\'] else None\n- args_dict[\'cellranger_file\'] = args.__dict__[\'cellranger_file\'][i] \\\n- if args.__dict__[\'cellranger_file\'] else None\n- args.func(**args_dict)\n' |
b |
diff -r aff3ba86ef7a -r 98e3fecedd2b change_o/define_clones.sh --- a/change_o/define_clones.sh Mon Aug 31 11:20:08 2020 -0400 +++ b/change_o/define_clones.sh Tue Sep 01 16:03:44 2020 -0400 |
b |
@@ -21,7 +21,7 @@ output=${10} output2=${11} - python3 $dir/DefineClones.py -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --mode $mode --act $act --model $model --dist $dist --norm $norm --sym $sym --link $link + DefineClones.py -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --mode $mode --act $act --model $model --dist $dist --norm $norm --sym $sym --link $link Rscript $dir/define_clones.r $PWD/outdir/output_clone-pass.tab $output2 2>&1 else @@ -29,7 +29,7 @@ output=$4 output2=$5 - python3 $dir/DefineClones.py hclust -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --method $method + DefineClones.py hclust -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --method $method Rscript $dir/define_clones.r $PWD/outdir/output_clone-pass.tab $output2 2>&1 fi |
b |
diff -r aff3ba86ef7a -r 98e3fecedd2b change_o/makedb.sh --- a/change_o/makedb.sh Mon Aug 31 11:20:08 2020 -0400 +++ b/change_o/makedb.sh Tue Sep 01 16:03:44 2020 -0400 |
b |
@@ -29,7 +29,7 @@ echo "makedb: $PWD/outdir" -python3 $dir/MakeDb.py imgt -i $input --outdir $PWD/outdir --outname output $noparse $scores $regions +MakeDb.py imgt -i $input --outdir $PWD/outdir --outname output $noparse $scores $regions mv $PWD/outdir/output_db-pass.tab $output |