Repository 'shm_csr'
hg clone https://toolshed.g2.bx.psu.edu/repos/davidvanzessen/shm_csr

Changeset 77:58d2377b507d (2019-06-19)
Previous changeset 76:a93136637bea (2019-06-18) Next changeset 78:aff3ba86ef7a (2020-08-31)
Commit message:
Uploaded
modified:
README.md
merge_and_filter.r
shm_csr.xml
removed:
change_o/DefineClones.py
change_o/MakeDb.py
b
diff -r a93136637bea -r 58d2377b507d README.md
--- a/README.md Tue Jun 18 04:47:44 2019 -0400
+++ b/README.md Wed Jun 19 04:31:44 2019 -0400
[
@@ -1,6 +1,7 @@
 # SHM CSR
 
-Somatic hypermutation and class switch recombination pipeline 
+Somatic hypermutation and class switch recombination pipeline.  
+The docker version can be found [here](https://github.com/ErasmusMC-Bioinformatics/ARGalaxy-docker).
 
 # Dependencies
 --------------------
@@ -9,4 +10,4 @@
 [Baseline](http://selection.med.yale.edu/baseline/)  
 [R data.table](https://cran.r-project.org/web/packages/data.table/data.table.pdf)
 [R ggplot2](https://cran.r-project.org/web/packages/ggplot2/ggplot2.pdf)
-[R reshape2](https://cran.r-project.org/web/packages/reshape/reshape.pdf)
\ No newline at end of file
+[R reshape2](https://cran.r-project.org/web/packages/reshape/reshape.pdf)
b
diff -r a93136637bea -r 58d2377b507d change_o/DefineClones.py
--- a/change_o/DefineClones.py Tue Jun 18 04:47:44 2019 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,1123 +0,0 @@\n-#!/usr/bin/env python3\n-"""\n-Assign Ig sequences into clones\n-"""\n-# Info\n-__author__ = \'Namita Gupta, Jason Anthony Vander Heiden, Gur Yaari, Mohamed Uduman\'\n-from changeo import __version__, __date__\n-\n-# Imports\n-import os\n-import re\n-import sys\n-import csv\n-import numpy as np\n-from argparse import ArgumentParser\n-from collections import OrderedDict\n-from itertools import chain\n-from textwrap import dedent\n-from time import time\n-from Bio import pairwise2\n-from Bio.Seq import translate\n-\n-# Presto and changeo imports\n-from presto.Defaults import default_out_args\n-from presto.IO import getFileType, getOutputHandle, printLog, printProgress\n-from presto.Multiprocessing import manageProcesses\n-from presto.Sequence import getDNAScoreDict\n-from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs\n-from changeo.Distance import distance_models, calcDistances, formClusters\n-from changeo.IO import getDbWriter, readDbFile, countDbFile\n-from changeo.Multiprocessing import DbData, DbResult\n-\n-## Set maximum field size for csv.reader\n-csv.field_size_limit(sys.maxsize)\n-\n-# Defaults\n-default_translate = False\n-default_distance = 0.0\n-default_index_mode = \'gene\'\n-default_index_action = \'set\'\n-default_bygroup_model = \'ham\'\n-default_hclust_model = \'chen2010\'\n-default_seq_field = \'JUNCTION\'\n-default_norm = \'len\'\n-default_sym = \'avg\'\n-default_linkage = \'single\'\n-choices_bygroup_model = (\'ham\', \'aa\', \'hh_s1f\', \'hh_s5f\', \'mk_rs1nf\', \'mk_rs5nf\', \'hs1f_compat\', \'m1n_compat\')\n-\n-\n-def indexByIdentity(index, key, rec, fields=None):\n-    """\n-    Updates a preclone index with a simple key\n-\n-    Arguments:\n-    index = preclone index from indexJunctions\n-    key = index key\n-    rec = IgRecord to add to the index\n-    fields = additional annotation fields to use to group preclones;\n-             if None use only V, J and junction length\n-\n-    Returns:\n-    None. Updates index with new key and records.\n-    """\n-    index.setdefault(tuple(key), []).append(rec)\n-\n-\n-def indexByUnion(index, key, rec, fields=None):\n-    """\n-    Updates a preclone index with the union of nested keys\n-\n-    Arguments:\n-    index = preclone index from indexJunctions\n-    key = index key\n-    rec = IgRecord to add to the index\n-    fields = additional annotation fields to use to group preclones;\n-             if None use only V, J and junction length\n-\n-    Returns:\n-    None. Updates index with new key and records.\n-    """\n-    # List of values for this/new key\n-    val = [rec]\n-    f_range = list(range(2, 3 + (len(fields) if fields else 0)))\n-\n-    # See if field/junction length combination exists in index\n-    outer_dict = index\n-    for field in f_range:\n-        try:\n-            outer_dict = outer_dict[key[field]]\n-        except (KeyError):\n-            outer_dict = None\n-            break\n-    # If field combination exists, look through Js\n-    j_matches = []\n-    if outer_dict is not None:\n-        for j in outer_dict.keys():\n-            if not set(key[1]).isdisjoint(set(j)):\n-                key[1] = tuple(set(key[1]).union(set(j)))\n-                j_matches += [j]\n-    # If J overlap exists, look through Vs for each J\n-    for j in j_matches:\n-        v_matches = []\n-        # Collect V matches for this J\n-        for v in outer_dict[j].keys():\n-            if not set(key[0]).isdisjoint(set(v)):\n-                key[0] = tuple(set(key[0]).union(set(v)))\n-                v_matches += [v]\n-        # If there are V overlaps for this J, pop them out\n-        if v_matches:\n-            val += list(chain(*(outer_dict[j].pop(v) for v in v_matches)))\n-            # If the J dict is now empty, remove it\n-            if not outer_dict[j]:\n-                outer_dict.pop(j, None)\n-\n-    # Add value(s) into index nested dictionary\n-    # OMG Python pointers are the best!\n-    # Add field dictionaries into index\n-    outer_dict = index\n-    for field in f_range:\n-        outer_dict.setdefault(key[field], {})\n-        out'..b'      default=default_seq_field,\n-                                help=\'\'\'The name of the field to be used to calculate\n-                                     distance between records\'\'\')\n-    parser_bygroup.set_defaults(feed_func=feedQueue)\n-    parser_bygroup.set_defaults(work_func=processQueue)\n-    parser_bygroup.set_defaults(collect_func=collectQueue)  \n-    parser_bygroup.set_defaults(group_func=indexJunctions)  \n-    parser_bygroup.set_defaults(clone_func=distanceClones)\n-    \n-    # Chen2010\n-    parser_chen = subparsers.add_parser(\'chen2010\', parents=[parser_parent],\n-                                        formatter_class=CommonHelpFormatter,\n-                                        help=\'\'\'Defines clones by method specified in Chen, 2010.\'\'\',\n-                                        description=\'\'\'Defines clones by method specified in Chen, 2010.\'\'\')\n-    parser_chen.set_defaults(feed_func=feedQueueClust)\n-    parser_chen.set_defaults(work_func=processQueueClust)\n-    parser_chen.set_defaults(collect_func=collectQueueClust)\n-    parser_chen.set_defaults(cluster_func=hierClust)\n-\n-    # Ademokun2011\n-    parser_ade = subparsers.add_parser(\'ademokun2011\', parents=[parser_parent],\n-                                        formatter_class=CommonHelpFormatter,\n-                                        help=\'\'\'Defines clones by method specified in Ademokun, 2011.\'\'\',\n-                                        description=\'\'\'Defines clones by method specified in Ademokun, 2011.\'\'\')\n-    parser_ade.set_defaults(feed_func=feedQueueClust)\n-    parser_ade.set_defaults(work_func=processQueueClust)\n-    parser_ade.set_defaults(collect_func=collectQueueClust)\n-    parser_ade.set_defaults(cluster_func=hierClust)\n-        \n-    return parser\n-\n-\n-if __name__ == \'__main__\':\n-    """\n-    Parses command line arguments and calls main function\n-    """\n-    # Parse arguments\n-    parser = getArgParser()\n-    args = parser.parse_args()\n-    args_dict = parseCommonArgs(args)\n-    # Convert case of fields\n-    if \'seq_field\' in args_dict:\n-        args_dict[\'seq_field\'] = args_dict[\'seq_field\'].upper()\n-    if \'fields\' in args_dict and args_dict[\'fields\'] is not None:  \n-        args_dict[\'fields\'] = [f.upper() for f in args_dict[\'fields\']]\n-    \n-    # Define clone_args\n-    if args.command == \'bygroup\':\n-        args_dict[\'group_args\'] = {\'fields\': args_dict[\'fields\'],\n-                                   \'action\': args_dict[\'action\'], \n-                                   \'mode\':args_dict[\'mode\']}\n-        args_dict[\'clone_args\'] = {\'model\':  args_dict[\'model\'],\n-                                   \'distance\':  args_dict[\'distance\'],\n-                                   \'norm\': args_dict[\'norm\'],\n-                                   \'sym\': args_dict[\'sym\'],\n-                                   \'linkage\': args_dict[\'linkage\'],\n-                                   \'seq_field\': args_dict[\'seq_field\']}\n-\n-        # Get distance matrix\n-        try:\n-            args_dict[\'clone_args\'][\'dist_mat\'] = distance_models[args_dict[\'model\']]\n-        except KeyError:\n-            sys.exit(\'Unrecognized distance model: %s\' % args_dict[\'model\'])\n-\n-        del args_dict[\'fields\']\n-        del args_dict[\'action\']\n-        del args_dict[\'mode\']\n-        del args_dict[\'model\']\n-        del args_dict[\'distance\']\n-        del args_dict[\'norm\']\n-        del args_dict[\'sym\']\n-        del args_dict[\'linkage\']\n-        del args_dict[\'seq_field\']\n-\n-    # Define clone_args\n-    if args.command == \'chen2010\':\n-        args_dict[\'clone_func\'] = distChen2010\n-        args_dict[\'cluster_args\'] = {\'method\': args.command }\n-\n-    if args.command == \'ademokun2011\':\n-        args_dict[\'clone_func\'] = distAdemokun2011\n-        args_dict[\'cluster_args\'] = {\'method\': args.command }\n-    \n-    # Call defineClones\n-    del args_dict[\'command\']\n-    del args_dict[\'db_files\']\n-    for f in args.__dict__[\'db_files\']:\n-        args_dict[\'db_file\'] = f\n-        defineClones(**args_dict)\n'
b
diff -r a93136637bea -r 58d2377b507d change_o/MakeDb.py
--- a/change_o/MakeDb.py Tue Jun 18 04:47:44 2019 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,555 +0,0 @@\n-#!/usr/bin/env python3\n-"""\n-Create tab-delimited database file to store sequence alignment information\n-"""\n-# Info\n-__author__ = \'Namita Gupta, Jason Anthony Vander Heiden\'\n-from changeo import __version__, __date__\n-\n-# Imports\n-import os\n-import sys\n-from argparse import ArgumentParser\n-from collections import OrderedDict\n-from textwrap import dedent\n-from time import time\n-from Bio import SeqIO\n-\n-# Presto and changeo imports\n-from presto.Defaults import default_out_args\n-from presto.Annotation import parseAnnotation\n-from presto.IO import countSeqFile, printLog, printMessage, printProgress, readSeqFile\n-from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs\n-from changeo.IO import countDbFile, extractIMGT, getDbWriter, readRepo\n-from changeo.Parsers import IgBLASTReader, IMGTReader, IHMMuneReader, getIDforIMGT\n-\n-\n-def getFilePrefix(aligner_output, out_args):\n-    """\n-    Get file name prefix and create output directory\n-\n-    Arguments:\n-      aligner_output : aligner output file or directory.\n-      out_args : dictionary of output arguments.\n-\n-    Returns:\n-        str : file name prefix.\n-    """\n-    # Determine output directory\n-    if not out_args[\'out_dir\']:\n-        out_dir = os.path.dirname(os.path.abspath(aligner_output))\n-    else:\n-        out_dir = os.path.abspath(out_args[\'out_dir\'])\n-        if not os.path.exists(out_dir):\n-            os.mkdir(out_dir)\n-\n-    # Determine file prefix\n-    if out_args[\'out_name\']:\n-        file_prefix = out_args[\'out_name\']\n-    else:\n-        file_prefix = os.path.splitext(os.path.split(os.path.abspath(aligner_output))[1])[0]\n-\n-    return os.path.join(out_dir, file_prefix)\n-\n-\n-def getSeqDict(seq_file):\n-    """\n-    Create a dictionary from a sequence file.\n-\n-    Arguments:\n-      seq_file : sequence file.\n-\n-    Returns:\n-        dict : sequence description as keys with Bio.SeqRecords as values.\n-    """\n-    seq_dict = SeqIO.to_dict(readSeqFile(seq_file),\n-                             key_function=lambda x: x.description)\n-\n-    return seq_dict\n-\n-\n-def writeDb(db, fields, file_prefix, total_count, id_dict=None, no_parse=True, partial=False,\n-            out_args=default_out_args):\n-    """\n-    Writes tab-delimited database file in output directory.\n-    \n-    Arguments:\n-      db : a iterator of IgRecord objects containing alignment data.\n-      fields : a list of ordered field names to write.\n-      file_prefix : directory and prefix for CLIP tab-delim file.\n-      total_count : number of records (for progress bar).\n-      id_dict : a dictionary of the truncated sequence ID mapped to the full sequence ID.\n-      no_parse : if ID is to be parsed for pRESTO output with default delimiters.\n-      partial : if True put incomplete alignments in the pass file.\n-      out_args : common output argument dictionary from parseCommonArgs.\n-\n-    Returns:\n-      None\n-    """\n-    # Function to check for valid records strictly\n-    def _pass_strict(rec):\n-        valid = [rec.v_call and rec.v_call != \'None\',\n-                 rec.j_call and rec.j_call != \'None\',\n-                 rec.functional is not None,\n-                 rec.seq_vdj,\n-                 rec.junction]\n-        return all(valid)\n-\n-    # Function to check for valid records loosely\n-    def _pass_gentle(rec):\n-        valid = [rec.v_call and rec.v_call != \'None\',\n-                 rec.d_call and rec.d_call != \'None\',\n-                 rec.j_call and rec.j_call != \'None\']\n-        return any(valid)\n-\n-    # Set pass criteria\n-    _pass = _pass_gentle if partial else _pass_strict\n-\n-    # Define output file names\n-    pass_file = \'%s_db-pass.tab\' % file_prefix\n-    fail_file = \'%s_db-fail.tab\' % file_prefix\n-\n-    # Initiate handles, writers and counters\n-    pass_handle = None\n-    fail_handle = None\n-    pass_writer = None\n-    fail_writer = None\n-    start_time = time()\n-    rec_count = pass_count = fail_count = 0\n-\n-    # Validate and write output\n-    pr'..b'                                  help=\'Process iHMMune-Align output.\',\n-                                        description=\'Process iHMMune-Align output.\')\n-    parser_ihmm.add_argument(\'-i\', nargs=\'+\', action=\'store\', dest=\'aligner_outputs\',\n-                             required=True,\n-                             help=\'\'\'iHMMune-Align output file.\'\'\')\n-    parser_ihmm.add_argument(\'-r\', nargs=\'+\', action=\'store\', dest=\'repo\', required=True,\n-                             help=\'\'\'List of folders and/or FASTA files containing\n-                                  IMGT-gapped germline sequences corresponding to the\n-                                  set of germlines used in the IgBLAST alignment.\'\'\')\n-    parser_ihmm.add_argument(\'-s\', action=\'store\', nargs=\'+\', dest=\'seq_files\',\n-                             required=True,\n-                             help=\'\'\'List of input FASTA files (with .fasta, .fna or .fa\n-                                  extension) containing sequences.\'\'\')\n-    parser_ihmm.add_argument(\'--noparse\', action=\'store_true\', dest=\'no_parse\',\n-                             help=\'\'\'Specify to prevent input sequence headers from being parsed\n-                                  to add new columns to database. Parsing of sequence headers requires\n-                                  headers to be in the pRESTO annotation format, so this should be specified\n-                                  when sequence headers are incompatible with the pRESTO annotation scheme.\n-                                  Note, unrecognized header formats will default to this behavior.\'\'\')\n-    parser_ihmm.add_argument(\'--partial\', action=\'store_true\', dest=\'partial\',\n-                             help=\'\'\'If specified, include incomplete V(D)J alignments in\n-                                  the pass file instead of the fail file.\'\'\')\n-    parser_ihmm.add_argument(\'--scores\', action=\'store_true\', dest=\'parse_scores\',\n-                             help=\'\'\'Specify if alignment score metrics should be\n-                                  included in the output. Adds the path score of the\n-                                  iHMMune-Align hidden Markov model to HMM_SCORE.\'\'\')\n-    parser_ihmm.add_argument(\'--regions\', action=\'store_true\', dest=\'parse_regions\',\n-                             help=\'\'\'Specify if IMGT FWRs and CDRs should be\n-                                  included in the output. Adds the FWR1_IMGT, FWR2_IMGT,\n-                                  FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and\n-                                  CDR3_IMGT columns.\'\'\')\n-    parser_ihmm.set_defaults(func=parseIHMM)\n-\n-    return parser\n-    \n-    \n-if __name__ == "__main__":\n-    """\n-    Parses command line arguments and calls main\n-    """\n-    parser = getArgParser()    \n-    args = parser.parse_args()\n-    args_dict = parseCommonArgs(args, in_arg=\'aligner_outputs\')\n-\n-    # Set no ID parsing if sequence files are not provided\n-    if \'seq_files\' in args_dict and not args_dict[\'seq_files\']:\n-        args_dict[\'no_parse\'] = True\n-\n-    # Delete\n-    if \'seq_files\' in args_dict: del args_dict[\'seq_files\']\n-    if \'aligner_outputs\' in args_dict: del args_dict[\'aligner_outputs\']\n-    if \'command\' in args_dict: del args_dict[\'command\']\n-    if \'func\' in args_dict: del args_dict[\'func\']           \n-    \n-    if args.command == \'imgt\':\n-        for i in range(len(args.__dict__[\'aligner_outputs\'])):\n-            args_dict[\'aligner_output\'] = args.__dict__[\'aligner_outputs\'][i]\n-            args_dict[\'seq_file\'] = args.__dict__[\'seq_files\'][i] \\\n-                                    if args.__dict__[\'seq_files\'] else None\n-            args.func(**args_dict)\n-    elif args.command == \'igblast\' or args.command == \'ihmm\':\n-        for i in range(len(args.__dict__[\'aligner_outputs\'])):\n-            args_dict[\'aligner_output\'] =  args.__dict__[\'aligner_outputs\'][i]\n-            args_dict[\'seq_file\'] = args.__dict__[\'seq_files\'][i]\n-            args.func(**args_dict)\n'
b
diff -r a93136637bea -r 58d2377b507d merge_and_filter.r
--- a/merge_and_filter.r Tue Jun 18 04:47:44 2019 -0400
+++ b/merge_and_filter.r Wed Jun 19 04:31:44 2019 -0400
[
@@ -53,6 +53,10 @@
 hotspots = fix_column_names(hotspots)
 AAs = fix_column_names(AAs)
 
+if(!("Sequence.number" %in% names(summ))){ 
+ summ["Sequence.number"] = 1:nrow(summ)
+}
+
 if(method == "blastn"){
  #"qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore"
  gene_identification = gene_identification[!duplicated(gene_identification$qseqid),]
@@ -140,9 +144,6 @@
 names(sequences) = c("Sequence.ID", "FR1.IMGT.seq", "CDR1.IMGT.seq", "FR2.IMGT.seq", "CDR2.IMGT.seq", "FR3.IMGT.seq", "CDR3.IMGT.seq")
 result = merge(result, sequences, by="Sequence.ID", all.x=T)
 
-print("sequences files columns")
-print("CDR3.IMGT")
-
 AAs = AAs[,c("Sequence.ID", "CDR3.IMGT")]
 names(AAs) = c("Sequence.ID", "CDR3.IMGT.AA")
 result = merge(result, AAs, by="Sequence.ID", all.x=T)
@@ -172,10 +173,17 @@
 
 write.table(x=result, file=gsub("merged.txt$", "before_filters.txt", output), sep="\t",quote=F,row.names=F,col.names=T)
 
-print(paste("Number of empty CDR1 sequences:", sum(result$CDR1.IMGT.seq == "", na.rm=T)))
-print(paste("Number of empty FR2 sequences:", sum(result$FR2.IMGT.seq == "", na.rm=T)))
-print(paste("Number of empty CDR2 sequences:", sum(result$CDR2.IMGT.seq == "", na.rm=T)))
-print(paste("Number of empty FR3 sequences:", sum(result$FR3.IMGT.seq == "", na.rm=T)))
+missing.FR1 = result$FR1.IMGT.seq == "" | is.na(result$FR1.IMGT.seq)
+missing.CDR1 = result$CDR1.IMGT.seq == "" | is.na(result$CDR1.IMGT.seq)
+missing.FR2 = result$FR2.IMGT.seq == "" | is.na(result$FR2.IMGT.seq)
+missing.CDR2 = result$CDR2.IMGT.seq == "" | is.na(result$CDR2.IMGT.seq)
+missing.FR3 = result$FR3.IMGT.seq == "" | is.na(result$FR3.IMGT.seq)
+
+print(paste("Number of empty CDR1 sequences:", sum(missing.FR1)))
+print(paste("Number of empty FR2 sequences:", sum(missing.CDR1)))
+print(paste("Number of empty CDR2 sequences:", sum(missing.FR2)))
+print(paste("Number of empty FR3 sequences:", sum(missing.CDR2)))
+print(paste("Number of empty FR3 sequences:", sum(missing.FR3)))
 
 if(empty.region.filter == "leader"){
  result = result[result$FR1.IMGT.seq != "" & result$CDR1.IMGT.seq != "" & result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ]
@@ -217,6 +225,7 @@
 
 write.table(result, before.unique.file, sep="\t", quote=F,row.names=F,col.names=T)
 
+
 if(filter.unique != "no"){
  clmns = names(result)
  if(filter.unique == "remove_vjaa"){
@@ -291,6 +300,9 @@
 
 matched.sequences.count = nrow(matched.sequences)
 unmatched.sequences.count = sum(grepl("^unmatched", result$best_match))
+if(matched.sequences.count <= unmatched.sequences.count){
+ print("WARNING NO MATCHED (SUB)CLASS SEQUENCES!!")
+}
 
 filtering.steps = rbind(filtering.steps, c("Number of matched sequences", matched.sequences.count))
 filtering.steps = rbind(filtering.steps, c("Number of unmatched sequences", unmatched.sequences.count))
b
diff -r a93136637bea -r 58d2377b507d shm_csr.xml
--- a/shm_csr.xml Tue Jun 18 04:47:44 2019 -0400
+++ b/shm_csr.xml Wed Jun 19 04:31:44 2019 -0400
b
@@ -8,7 +8,7 @@
  <requirement type="package" version="0.5.0">r-scales</requirement>
  <requirement type="package" version="3.4_5">r-seqinr</requirement>
  <requirement type="package" version="1.11.4">r-data.table</requirement>
- <requirement type="package" version="0.4.5">changeo</requirement>
+ <!--<requirement type="package" version="0.4.5">changeo</requirement>-->
  </requirements>
  <command interpreter="bash">
  #if str ( $filter_unique.filter_unique_select ) == "remove":