Repository 'shm_csr'
hg clone https://toolshed.g2.bx.psu.edu/repos/davidvanzessen/shm_csr

Changeset 78:aff3ba86ef7a (2020-08-31)
Previous changeset 77:58d2377b507d (2019-06-19) Next changeset 79:98e3fecedd2b (2020-09-01)
Commit message:
Uploaded
modified:
LICENSE
README.md
aa_histogram.r
baseline/Baseline_Functions.r
baseline/Baseline_Main.r
baseline/FiveS_Mutability.RData
baseline/FiveS_Substitution.RData
baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa
baseline/IMGTVHreferencedataset20161215.fa
baseline/IMGTVHreferencedataset20161215.fasta
baseline/baseline_url.txt
baseline/comparePDFs.r
baseline/filter.r
baseline/script_imgt.py
baseline/script_xlsx.py
baseline/wrapper.sh
change_o/change_o_url.txt
change_o/define_clones.r
change_o/define_clones.sh
change_o/makedb.sh
change_o/select_first_in_clone.r
check_unique_id.r
datatypes_conf.xml
gene_identification.py
imgt_loader.r
merge.r
merge_and_filter.r
naive_output.r
new_imgt.r
pattern_plots.r
plot_pdf.r
sequence_overview.r
shm_clonality.htm
shm_csr.htm
shm_csr.py
shm_csr.r
shm_csr.xml
shm_downloads.htm
shm_first.htm
shm_frequency.htm
shm_overview.htm
shm_selection.htm
shm_transition.htm
style.tar.gz
subclass_definition.db.nhr
subclass_definition.db.nin
subclass_definition.db.nsq
summary_to_fasta.py
wrapper.sh
added:
.gitattributes
.gitignore
change_o/DefineClones.py
change_o/MakeDb.py
mutation_column_checker.py
b
diff -r 58d2377b507d -r aff3ba86ef7a .gitattributes
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/.gitattributes Mon Aug 31 11:20:08 2020 -0400
b
@@ -0,0 +1,2 @@
+# Auto detect text files and perform LF normalization
+* text=auto
b
diff -r 58d2377b507d -r aff3ba86ef7a .gitignore
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/.gitignore Mon Aug 31 11:20:08 2020 -0400
b
@@ -0,0 +1,4 @@
+
+shm_csr\.tar\.gz
+
+\.vscode/settings\.json
b
diff -r 58d2377b507d -r aff3ba86ef7a change_o/DefineClones.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/change_o/DefineClones.py Mon Aug 31 11:20:08 2020 -0400
[
b'@@ -0,0 +1,739 @@\n+#!/usr/bin/env python3\n+"""\n+Assign Ig sequences into clones\n+"""\n+\n+# Info\n+__author__ = \'Namita Gupta, Jason Anthony Vander Heiden, Gur Yaari, Mohamed Uduman\'\n+from changeo import __version__, __date__\n+\n+# Imports\n+import os\n+import re\n+import sys\n+from argparse import ArgumentParser\n+from collections import OrderedDict\n+from itertools import chain\n+from textwrap import dedent\n+from time import time\n+from Bio.Seq import translate\n+\n+# Presto and changeo imports\n+from presto.Defaults import default_out_args\n+from presto.IO import printLog, printProgress, printCount, printWarning, printError\n+from presto.Multiprocessing import manageProcesses\n+from changeo.Defaults import default_format, default_v_field, default_j_field, default_junction_field\n+from changeo.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs\n+from changeo.Distance import distance_models, calcDistances, formClusters\n+from changeo.IO import countDbFile, getDbFields, getFormatOperators, getOutputHandle, \\\n+                       AIRRWriter, ChangeoWriter\n+from changeo.Multiprocessing import DbResult, feedDbQueue, processDbQueue\n+\n+# Defaults\n+default_translate = False\n+default_distance = 0.0\n+default_index_mode = \'gene\'\n+default_index_action = \'set\'\n+default_distance_model = \'ham\'\n+default_norm = \'len\'\n+default_sym = \'avg\'\n+default_linkage = \'single\'\n+default_max_missing=0\n+choices_distance_model = (\'ham\', \'aa\', \'hh_s1f\', \'hh_s5f\',\n+                          \'mk_rs1nf\', \'mk_rs5nf\',\n+                          \'hs1f_compat\', \'m1n_compat\')\n+\n+\n+def filterMissing(data, seq_field=default_junction_field, v_field=default_v_field,\n+                  j_field=default_j_field, max_missing=default_max_missing):\n+    """\n+    Splits a set of sequence into passed and failed groups based on the number\n+    of missing characters in the sequence\n+\n+    Arguments:\n+        data : 
changeo.Multiprocessing.DbData object.\n+        seq_field : sequence field to filter on.\n+        v_field : field containing the V call.\n+        j_field : field containing the J call.\n+        max_missing : maximum number of missing characters (non-ACGT) to permit before failing the record.\n+\n+    Returns:\n+        changeo.Multiprocessing.DbResult : objected containing filtered records.\n+    """\n+    # Function to validate the sequence string\n+    def _pass(seq):\n+        if len(seq) > 0 and len(re.findall(r\'[^ACGT]\', seq)) <= max_missing:\n+            return True\n+        else:\n+            return False\n+\n+    # Define result object for iteration and get data records\n+    result = DbResult(data.id, data.data)\n+\n+    if not data:\n+        result.data_pass = []\n+        result.data_fail = data.data\n+        return result\n+\n+    result.data_pass = []\n+    result.data_fail = []\n+    for rec in data.data:\n+        seq = rec.getField(seq_field)\n+        if _pass(seq):  result.data_pass.append(rec)\n+        else:  result.data_fail.append(rec)\n+\n+    # Add V(D)J to log\n+    result.log[\'ID\'] = \',\'.join([str(x) for x in data.id])\n+    result.log[\'VCALL\'] = \',\'.join(set([(r.getVAllele(field=v_field) or \'\') for r in data.data]))\n+    result.log[\'JCALL\'] = \',\'.join(set([(r.getJAllele(field=j_field) or \'\') for r in data.data]))\n+    result.log[\'JUNCLEN\'] = \',\'.join(set([(str(len(r.junction)) or \'0\') for r in data.data]))\n+    result.log[\'CLONED\'] = len(result.data_pass)\n+    result.log[\'FILTERED\'] = len(result.data_fail)\n+\n+    return result\n+\n+\n+def indexByIdentity(index, key, rec, group_fields=None):\n+    """\n+    Updates a preclone index with a simple key\n+\n+    Arguments:\n+      index : preclone index from groupByGene\n+      key : index key\n+      rec : Receptor to add to the index\n+      group_fields : additional annotation fields to use to group preclones;\n+                     if None use 
only V, J and junction length\n+\n+    Returns:\n+      None : Updates index with new key and records.\n+    """\n+    index.setdefault(tuple(key), []).append(rec)\n+\n+\n+def i'..b'.add_argument(\'--norm\', action=\'store\', dest=\'norm\',\n+                        choices=(\'len\', \'mut\', \'none\'), default=default_norm,\n+                        help=\'\'\'Specifies how to normalize distances. One of none\n+                             (do not normalize), len (normalize by length),\n+                             or mut (normalize by number of mutations between sequences).\'\'\')\n+    group.add_argument(\'--sym\', action=\'store\', dest=\'sym\',\n+                        choices=(\'avg\', \'min\'), default=default_sym,\n+                        help=\'\'\'Specifies how to combine asymmetric distances. One of avg\n+                             (average of A->B and B->A) or min (minimum of A->B and B->A).\'\'\')\n+    group.add_argument(\'--link\', action=\'store\', dest=\'linkage\',\n+                        choices=(\'single\', \'average\', \'complete\'), default=default_linkage,\n+                        help=\'\'\'Type of linkage to use for hierarchical clustering.\'\'\')\n+    group.add_argument(\'--maxmiss\', action=\'store\', dest=\'max_missing\', type=int,\n+                        default=default_max_missing,\n+                        help=\'\'\'The maximum number of non-ACGT characters (gaps or Ns) to \n+                             permit in the junction sequence before excluding the record \n+                             from clonal assignment. Note, under single linkage \n+                             non-informative positions can create artifactual links \n+                             between unrelated sequences. 
Use with caution.\'\'\')\n+    parser.set_defaults(group_func=groupByGene)\n+    parser.set_defaults(clone_func=distanceClones)\n+        \n+    return parser\n+\n+\n+if __name__ == \'__main__\':\n+    """\n+    Parses command line arguments and calls main function\n+    """\n+    # Parse arguments\n+    parser = getArgParser()\n+    checkArgs(parser)\n+    args = parser.parse_args()\n+    args_dict = parseCommonArgs(args)\n+\n+    # # Set default fields if not specified.\n+    # default_fields = {\'seq_field\': default_junction_field,\n+    #                   \'v_field\': default_v_field,\n+    #                   \'j_field\': default_j_field}\n+    #\n+    # # Default Change-O fields\n+    # if args_dict[\'format\'] == \'changeo\':\n+    #     for f in default_fields:\n+    #         if args_dict[f] is None:  args_dict[f] = default_fields[f]\n+    #         else: args_dict[f] = args_dict[f].upper()\n+    #\n+    # # Default AIRR fields\n+    # if args_dict[\'format\'] == \'airr\':\n+    #     for f in default_fields:\n+    #         if args_dict[f] is None:  args_dict[f] = ChangeoSchema.toAIRR(default_fields[f])\n+    #         else: args_dict[f] = args_dict[f].lower()\n+\n+    # Define grouping and cloning function arguments\n+    args_dict[\'group_args\'] = {\'action\': args_dict[\'action\'],\n+                               \'mode\':args_dict[\'mode\']}\n+    args_dict[\'clone_args\'] = {\'model\':  args_dict[\'model\'],\n+                               \'distance\':  args_dict[\'distance\'],\n+                               \'norm\': args_dict[\'norm\'],\n+                               \'sym\': args_dict[\'sym\'],\n+                               \'linkage\': args_dict[\'linkage\']}\n+\n+    # Get distance matrix\n+    try:\n+        args_dict[\'clone_args\'][\'dist_mat\'] = distance_models[args_dict[\'model\']]\n+    except KeyError:\n+        printError(\'Unrecognized distance model: %s\' % args_dict[\'model\'])\n+\n+    # Clean argument dictionary\n+    
del args_dict[\'action\']\n+    del args_dict[\'mode\']\n+    del args_dict[\'model\']\n+    del args_dict[\'distance\']\n+    del args_dict[\'norm\']\n+    del args_dict[\'sym\']\n+    del args_dict[\'linkage\']\n+\n+    # Clean arguments dictionary\n+    del args_dict[\'db_files\']\n+    if \'out_files\' in args_dict: del args_dict[\'out_files\']\n+\n+    # Call main function for each input file\n+    for i, f in enumerate(args.__dict__[\'db_files\']):\n+        args_dict[\'db_file\'] = f\n+        args_dict[\'out_file\'] = args.__dict__[\'out_files\'][i] \\\n+            if args.__dict__[\'out_files\'] else None\n+        defineClones(**args_dict)\n'
b
diff -r 58d2377b507d -r aff3ba86ef7a change_o/MakeDb.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/change_o/MakeDb.py Mon Aug 31 11:20:08 2020 -0400
[
b'@@ -0,0 +1,885 @@\n+#!/usr/bin/env python3\n+"""\n+Create tab-delimited database file to store sequence alignment information\n+"""\n+\n+# Info\n+__author__ = \'Namita Gupta, Jason Anthony Vander Heiden\'\n+from changeo import __version__, __date__\n+\n+# Imports\n+import os\n+import re\n+import csv\n+from argparse import ArgumentParser\n+from collections import OrderedDict\n+from textwrap import dedent\n+from time import time\n+from Bio import SeqIO\n+\n+# Presto and changeo imports\n+from presto.Annotation import parseAnnotation\n+from presto.IO import countSeqFile, printLog, printMessage, printProgress, printError, printWarning, readSeqFile\n+from changeo.Defaults import default_format, default_out_args\n+from changeo.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs\n+from changeo.Alignment import RegionDefinition\n+from changeo.Gene import buildGermline\n+from changeo.IO import countDbFile, extractIMGT, readGermlines, getFormatOperators, getOutputHandle, \\\n+                       AIRRWriter, ChangeoWriter, IgBLASTReader, IgBLASTReaderAA, IMGTReader, IHMMuneReader\n+from changeo.Receptor import ChangeoSchema, AIRRSchema\n+\n+# 10X Receptor attributes\n+cellranger_base = [\'cell\', \'c_call\', \'conscount\', \'umicount\']\n+cellranger_extended = [\'cell\', \'c_call\', \'conscount\', \'umicount\',\n+                       \'v_call_10x\', \'d_call_10x\', \'j_call_10x\',\n+                       \'junction_10x\', \'junction_10x_aa\']\n+\n+def readCellRanger(cellranger_file, fields=cellranger_base):\n+    """\n+    Load a Cell Ranger annotation table\n+\n+    Arguments:\n+      cellranger_file (str): path to the annotation file.\n+      fields (list): list of fields to keep.\n+\n+    Returns:\n+      dict: dict of dicts with contig_id as the primary key.\n+    """\n+    # Mapping of 10X annotations to Receptor attributes\n+    cellranger_map = {\'cell\':  \'barcode\',\n+                      \'c_call\': \'c_gene\',\n+         
             \'locus\': \'chain\',\n+                      \'conscount\': \'reads\',\n+                      \'umicount\': \'umis\',\n+                      \'v_call_10x\': \'v_gene\',\n+                      \'d_call_10x\': \'d_gene\',\n+                      \'j_call_10x\': \'j_gene\',\n+                      \'junction_10x\': \'cdr3_nt\',\n+                      \'junction_10x_aa\': \'cdr3\'}\n+\n+    # Function to parse individual fields\n+    def _parse(x):\n+        return \'\' if x == \'None\' else x\n+\n+    # Generate annotation dictionary\n+    ann_dict = {}\n+    with open(cellranger_file) as csv_file:\n+        # Detect delimiters\n+        dialect = csv.Sniffer().sniff(csv_file.readline())\n+        csv_file.seek(0)\n+        # Read in annotation file\n+        csv_reader = csv.DictReader(csv_file, dialect=dialect)\n+\n+        # Generate annotation dictionary\n+        for row in csv_reader:\n+            ann_dict[row[\'contig_id\']] = {f: _parse(row[cellranger_map[f]]) for f in fields}\n+\n+    return ann_dict\n+\n+\n+def addGermline(receptor, references, amino_acid=False):\n+    """\n+    Add full length germline to Receptor object\n+\n+    Arguments:\n+      receptor (changeo.Receptor.Receptor): Receptor object to modify.\n+      references (dict): dictionary of IMGT-gapped references sequences.\n+      amino_acid (bool): if True build amino acid germline, otherwise build nucleotide germline\n+\n+    Returns:\n+      changeo.Receptor.Receptor: modified Receptor with the germline sequence added.\n+    """\n+    if amino_acid:\n+        __, germlines, __ = buildGermline(receptor, references, seq_field=\'sequence_aa_imgt\',\n+                                          amino_acid=True)\n+        germline_seq = None if germlines is None else germlines[\'full\']\n+        receptor.setField(\'germline_aa_imgt\', germline_seq)\n+    else:\n+        __, germlines, __ = buildGermline(receptor, references, amino_acid=False)\n+        germline_seq = None if 
germlines is None else germlines[\'full\']\n+        receptor.setField(\'germline_imgt\', germline_seq)\n+\n+    return receptor\n+\n+\n+def getIDforIMGT(seq_file):\n+    """\n+    Create a sequenc'..b'         required=True,\n+                             help=\'\'\'iHMMune-Align output file.\'\'\')\n+    group_ihmm.add_argument(\'-r\', nargs=\'+\', action=\'store\', dest=\'repo\', required=True,\n+                             help=\'\'\'List of folders and/or FASTA files containing\n+                                   the set of germline sequences used by iHMMune-Align. These\n+                                   reference sequences must contain IMGT-numbering spacers (gaps)\n+                                   in the V segment.\'\'\')\n+    group_ihmm.add_argument(\'-s\', action=\'store\', nargs=\'+\', dest=\'seq_files\',\n+                             required=True,\n+                             help=\'\'\'List of input FASTA files (with .fasta, .fna or .fa\n+                                  extension) containing sequences.\'\'\')\n+    group_ihmm.add_argument(\'--10x\', action=\'store\', nargs=\'+\', dest=\'cellranger_file\',\n+                                help=\'\'\'Table file containing 10X annotations (with .csv or .tsv\n+                                     extension).\'\'\')\n+    group_ihmm.add_argument(\'--asis-id\', action=\'store_true\', dest=\'asis_id\',\n+                             help=\'\'\'Specify to prevent input sequence headers from being parsed\n+                                  to add new columns to database. 
Parsing of sequence headers requires\n+                                  headers to be in the pRESTO annotation format, so this should be specified\n+                                  when sequence headers are incompatible with the pRESTO annotation scheme.\n+                                  Note, unrecognized header formats will default to this behavior.\'\'\')\n+    group_ihmm.add_argument(\'--partial\', action=\'store_true\', dest=\'partial\',\n+                             help=\'\'\'If specified, include incomplete V(D)J alignments in\n+                                  the pass file instead of the fail file. An incomplete alignment\n+                                     is defined as a record for which a valid IMGT-gapped sequence \n+                                     cannot be built or that is missing a V gene assignment, \n+                                     J gene assignment, junction region, or productivity call.\'\'\')\n+    group_ihmm.add_argument(\'--extended\', action=\'store_true\', dest=\'extended\',\n+                             help=\'\'\'Specify to include additional aligner specific fields in the output. 
\n+                                  Adds the path score of the iHMMune-Align hidden Markov model as vdj_score;\n+                                  adds fwr1, fwr2, fwr3, fwr4, cdr1, cdr2 and cdr3.\'\'\')\n+    parser_ihmm.set_defaults(func=parseIHMM)\n+\n+    return parser\n+    \n+    \n+if __name__ == "__main__":\n+    """\n+    Parses command line arguments and calls main\n+    """\n+    parser = getArgParser()\n+    checkArgs(parser)\n+    args = parser.parse_args()\n+    args_dict = parseCommonArgs(args, in_arg=\'aligner_files\')\n+\n+    # Set no ID parsing if sequence files are not provided\n+    if \'seq_files\' in args_dict and not args_dict[\'seq_files\']:\n+        args_dict[\'asis_id\'] = True\n+\n+    # Delete\n+    if \'aligner_files\' in args_dict: del args_dict[\'aligner_files\']\n+    if \'seq_files\' in args_dict: del args_dict[\'seq_files\']\n+    if \'out_files\' in args_dict: del args_dict[\'out_files\']\n+    if \'command\' in args_dict: del args_dict[\'command\']\n+    if \'func\' in args_dict: del args_dict[\'func\']           \n+\n+    # Call main\n+    for i, f in enumerate(args.__dict__[\'aligner_files\']):\n+        args_dict[\'aligner_file\'] = f\n+        args_dict[\'seq_file\'] = args.__dict__[\'seq_files\'][i] \\\n+                                if args.__dict__[\'seq_files\'] else None\n+        args_dict[\'out_file\'] = args.__dict__[\'out_files\'][i] \\\n+                                if args.__dict__[\'out_files\'] else None\n+        args_dict[\'cellranger_file\'] = args.__dict__[\'cellranger_file\'][i] \\\n+                                if args.__dict__[\'cellranger_file\'] else None\n+        args.func(**args_dict)\n'
b
diff -r 58d2377b507d -r aff3ba86ef7a change_o/define_clones.sh
--- a/change_o/define_clones.sh Wed Jun 19 04:31:44 2019 -0400
+++ b/change_o/define_clones.sh Mon Aug 31 11:20:08 2020 -0400
b
@@ -21,7 +21,7 @@
  output=${10}
  output2=${11}
 
- DefineClones.py -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --mode $mode --act $act --model $model --dist $dist --norm $norm --sym $sym --link $link
+ python3 $dir/DefineClones.py -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --mode $mode --act $act --model $model --dist $dist --norm $norm --sym $sym --link $link
 
  Rscript $dir/define_clones.r $PWD/outdir/output_clone-pass.tab $output2 2>&1
 else
@@ -29,7 +29,7 @@
  output=$4
  output2=$5
 
- DefineClones.py hclust -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --method $method
+ python3 $dir/DefineClones.py hclust -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --method $method
 
  Rscript $dir/define_clones.r $PWD/outdir/output_clone-pass.tab $output2 2>&1
 fi
b
diff -r 58d2377b507d -r aff3ba86ef7a change_o/makedb.sh
--- a/change_o/makedb.sh Wed Jun 19 04:31:44 2019 -0400
+++ b/change_o/makedb.sh Mon Aug 31 11:20:08 2020 -0400
b
@@ -29,7 +29,7 @@
 
 echo "makedb: $PWD/outdir"
 
-MakeDb.py imgt -i $input --outdir $PWD/outdir --outname output $noparse $scores $regions
+python3 $dir/MakeDb.py imgt -i $input --outdir $PWD/outdir --outname output $noparse $scores $regions
 
 mv $PWD/outdir/output_db-pass.tab $output
 
b
diff -r 58d2377b507d -r aff3ba86ef7a merge_and_filter.r
--- a/merge_and_filter.r Wed Jun 19 04:31:44 2019 -0400
+++ b/merge_and_filter.r Mon Aug 31 11:20:08 2020 -0400
[
@@ -53,10 +53,6 @@
 hotspots = fix_column_names(hotspots)
 AAs = fix_column_names(AAs)
 
-if(!("Sequence.number" %in% names(summ))){ 
- summ["Sequence.number"] = 1:nrow(summ)
-}
-
 if(method == "blastn"){
  #"qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore"
  gene_identification = gene_identification[!duplicated(gene_identification$qseqid),]
@@ -173,17 +169,10 @@
 
 write.table(x=result, file=gsub("merged.txt$", "before_filters.txt", output), sep="\t",quote=F,row.names=F,col.names=T)
 
-missing.FR1 = result$FR1.IMGT.seq == "" | is.na(result$FR1.IMGT.seq)
-missing.CDR1 = result$CDR1.IMGT.seq == "" | is.na(result$CDR1.IMGT.seq)
-missing.FR2 = result$FR2.IMGT.seq == "" | is.na(result$FR2.IMGT.seq)
-missing.CDR2 = result$CDR2.IMGT.seq == "" | is.na(result$CDR2.IMGT.seq)
-missing.FR3 = result$FR3.IMGT.seq == "" | is.na(result$FR3.IMGT.seq)
-
-print(paste("Number of empty CDR1 sequences:", sum(missing.FR1)))
-print(paste("Number of empty FR2 sequences:", sum(missing.CDR1)))
-print(paste("Number of empty CDR2 sequences:", sum(missing.FR2)))
-print(paste("Number of empty FR3 sequences:", sum(missing.CDR2)))
-print(paste("Number of empty FR3 sequences:", sum(missing.FR3)))
+print(paste("Number of empty CDR1 sequences:", sum(result$CDR1.IMGT.seq == "", na.rm=T)))
+print(paste("Number of empty FR2 sequences:", sum(result$FR2.IMGT.seq == "", na.rm=T)))
+print(paste("Number of empty CDR2 sequences:", sum(result$CDR2.IMGT.seq == "", na.rm=T)))
+print(paste("Number of empty FR3 sequences:", sum(result$FR3.IMGT.seq == "", na.rm=T)))
 
 if(empty.region.filter == "leader"){
  result = result[result$FR1.IMGT.seq != "" & result$CDR1.IMGT.seq != "" & result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ]
b
diff -r 58d2377b507d -r aff3ba86ef7a mutation_column_checker.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mutation_column_checker.py Mon Aug 31 11:20:08 2020 -0400
[
@@ -0,0 +1,35 @@
+"""Check the FR3-IMGT column of an IMGT mutation table for malformed entries.
+
+Reads "7_V-REGION-mutation-and-AA-change-table.txt" from the working directory
+and, for every data row whose FR3-IMGT field is longer than 5 characters,
+prints the row's second field and a message naming its first field when any
+"|"-separated entry does not match mutationMatcher. A progress counter is
+printed every 100000 lines.
+"""
+import re
+
+# Raw string: "\d" must reach the regex engine as-is rather than being read as
+# an invalid string escape (a warning on current Python 3 releases).
+mutationMatcher = re.compile(r"^([nactg])(\d+).([nactg]),?[ ]?([A-Z])?(\d+)?[>]?([A-Z;])?(.*)?")
+
+with open("7_V-REGION-mutation-and-AA-change-table.txt", 'r') as file_handle:
+    fr3_index = -1
+    for i, line in enumerate(file_handle):
+        line_split = line.split("\t")
+        if i == 0:
+            # Header row: locate the FR3-IMGT column once.
+            fr3_index = line_split.index("FR3-IMGT")
+            continue
+
+        # Indexing fr3_index needs at least fr3_index + 1 fields, hence "<=".
+        if len(line_split) <= fr3_index:
+            continue
+
+        fr3_data = line_split[fr3_index]
+        if len(fr3_data) > 5:
+            entries = [x for x in fr3_data.split("|") if x]
+            # match() returns None for an entry that does not fit the pattern.
+            if not all(mutationMatcher.match(x) for x in entries):
+                print(line_split[1])
+                print("Something went wrong at line {line} with:".format(line=line_split[0]))
+        if i % 100000 == 0:
+            print(i)
b
diff -r 58d2377b507d -r aff3ba86ef7a shm_csr.xml
--- a/shm_csr.xml Wed Jun 19 04:31:44 2019 -0400
+++ b/shm_csr.xml Mon Aug 31 11:20:08 2020 -0400
b
@@ -2,13 +2,13 @@
  <description></description>
  <requirements>
  <requirement type="package" version="2.7">python</requirement>
+ <requirement type="package" version="1.16.0">numpy</requirement>
  <requirement type="package" version="1.2.0">xlrd</requirement>
  <requirement type="package" version="3.0.0">r-ggplot2</requirement>
  <requirement type="package" version="1.4.3">r-reshape2</requirement>
  <requirement type="package" version="0.5.0">r-scales</requirement>
  <requirement type="package" version="3.4_5">r-seqinr</requirement>
  <requirement type="package" version="1.11.4">r-data.table</requirement>
- <!--<requirement type="package" version="0.4.5">changeo</requirement>-->
  </requirements>
  <command interpreter="bash">
  #if str ( $filter_unique.filter_unique_select ) == "remove":