Repository 'mismatch_frequencies'
hg clone https://toolshed.g2.bx.psu.edu/repos/mvdbeek/mismatch_frequencies

Changeset 2:2974c382105c (2018-12-22)
Previous changeset 1:3613460e891e (2016-03-23)
Commit message:
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
modified:
mismatch_frequencies.py
mismatch_frequencies.xml
removed:
tool_dependencies.xml
b
diff -r 3613460e891e -r 2974c382105c mismatch_frequencies.py
--- a/mismatch_frequencies.py Wed Mar 23 09:59:33 2016 -0400
+++ b/mismatch_frequencies.py Sat Dec 22 04:15:47 2018 -0500
[
b'@@ -1,28 +1,33 @@\n-import pysam, re, string\n-import matplotlib.pyplot as plt\n+import re\n+import string\n+import pysam\n+import matplotlib\n import pandas as pd\n-import json\n from collections import defaultdict\n from collections import OrderedDict\n import argparse\n import itertools\n \n+matplotlib.use(\'pdf\')\n+import matplotlib.pyplot as plt  # noqa: E402\n+\n+\n class MismatchFrequencies:\n     \'\'\'Iterate over a SAM/BAM alignment file, collecting reads with mismatches. One\n     class instance per alignment file. The result_dict attribute will contain a\n     nested dictionary with name, readlength and mismatch count.\'\'\'\n-    def __init__(self, result_dict={}, alignment_file=None, name="name", minimal_readlength=21, \n+    def __init__(self, result_dict={}, alignment_file=None, name="name", minimal_readlength=21,\n                  maximal_readlength=21,\n-                 number_of_allowed_mismatches=1, \n-                 ignore_5p_nucleotides=0, \n+                 number_of_allowed_mismatches=1,\n+                 ignore_5p_nucleotides=0,\n                  ignore_3p_nucleotides=0,\n-                 possible_mismatches = [\n+                 possible_mismatches=[\n                         \'AC\', \'AG\', \'AT\',\n                         \'CA\', \'CG\', \'CT\',\n                         \'GA\', \'GC\', \'GT\',\n                         \'TA\', \'TC\', \'TG\'\n                 ]):\n-    \n+\n         self.result_dict = result_dict\n         self.name = name\n         self.minimal_readlength = minimal_readlength\n@@ -31,20 +36,19 @@\n         self.ignore_5p_nucleotides = ignore_5p_nucleotides\n         self.ignore_3p_nucleotides = ignore_3p_nucleotides\n         self.possible_mismatches = possible_mismatches\n-        \n+\n         if alignment_file:\n             self.pysam_alignment = pysam.Samfile(alignment_file)\n-            self.references = self.pysam_alignment.references #names of fasta reference sequences\n-            
result_dict[name]=self.get_mismatches(\n-                self.pysam_alignment, \n-                minimal_readlength, \n+            self.references = self.pysam_alignment.references  # names of fasta reference sequences\n+            result_dict[name] = self.get_mismatches(\n+                self.pysam_alignment,\n+                minimal_readlength,\n                 maximal_readlength,\n                 possible_mismatches\n             )\n-    \n-    def get_mismatches(self, pysam_alignment, minimal_readlength, \n+\n+    def get_mismatches(self, pysam_alignment, minimal_readlength,\n                        maximal_readlength, possible_mismatches):\n-        mismatch_dict = defaultdict(int)\n         rec_dd = lambda: defaultdict(rec_dd)\n         len_dict = rec_dd()\n         for alignedread in pysam_alignment:\n@@ -56,8 +60,8 @@\n                     len_dict[int(alignedread.rlen)][chromosome][\'total valid reads\'] = 1\n                 MD = alignedread.opt(\'MD\')\n                 if self.read_has_mismatch(alignedread, self.number_of_allowed_mismatches):\n-                    (ref_base, mismatch_base)=self.read_to_reference_mismatch(MD, alignedread.seq, alignedread.is_reverse)\n-                    if ref_base == None:\n+                    (ref_base, mismatch_base) = self.read_to_reference_mismatch(MD, alignedread.seq, alignedread.is_reverse)\n+                    if not ref_base:\n                             continue\n                     else:\n                         for i, base in enumerate(ref_base):\n@@ -68,7 +72,7 @@\n                             except TypeError:\n                                 len_dict[int(alignedread.rlen)][chromosome][ref_base[i]+mismatch_base[i]] = 1\n         return len_dict\n-    \n+\n     def read_is_valid(self, read, min_readlength, max_readlength):\n         \'\'\'Filter out reads that are unmatched, too short or\n         too long or that contian insertions\'\'\'\n@@ -80,17 +84,17 @@\n             return False\n        
 else:\n             return True\n-    \n+\n     def read_has_mismatch(self, read, number_of_allowed_mismatches=1):\n         \'\'\'keep only reads with one m'..b'        \'name\': name,\n+                \'minimal_readlength\': args.min,\n+                \'maximal_readlength\': args.max,\n+                \'number_of_allowed_mismatches\': args.n_mm,\n+                \'ignore_5p_nucleotides\': args.five_p,\n+                \'ignore_3p_nucleotides\': args.three_p,\n+                \'possible_mismatches\': args.possible_mismatches}\n+               for alignment_file, name in zip(args.input, args.name)]\n     return (kw_list, resultDict)\n \n+\n def nested_dict_to_df(dictionary):\n     dictionary = {(outerKey, innerKey): values for outerKey, innerDict in dictionary.iteritems() for innerKey, values in innerDict.iteritems()}\n-    df=pd.DataFrame.from_dict(dictionary).transpose()\n+    df = pd.DataFrame.from_dict(dictionary).transpose()\n     df.index.names = [\'Library\', \'Readlength\']\n     return df\n \n+\n def run_MismatchFrequencies(args):\n-    kw_list, resultDict=setup_MismatchFrequencies(args)\n+    kw_list, resultDict = setup_MismatchFrequencies(args)\n     references = [MismatchFrequencies(**kw_dict).references for kw_dict in kw_list]\n     return (resultDict, references[0])\n \n+\n def main():\n     result_dict, references = run_MismatchFrequencies(args)\n     df = format_result_dict(result_dict, references, args.possible_mismatches)\n@@ -273,12 +286,12 @@\n     plot_result(reduced_dict, args)\n     reduced_df = nested_dict_to_df(reduced_dict)\n     df_to_tab(reduced_df, args.output_tab)\n-    if not args.expanded_output_tab == None:\n+    if args.expanded_output_tab:\n         df_to_tab(df, args.expanded_output_tab)\n     return reduced_dict\n \n if __name__ == "__main__":\n-    \n+\n     parser = argparse.ArgumentParser(description=\'Produce mismatch statistics for BAM/SAM alignment files.\')\n     parser.add_argument(\'--input\', nargs=\'*\', 
help=\'Input files in SAM/BAM format\')\n     parser.add_argument(\'--name\', nargs=\'*\', help=\'Name for input file to display in output file. Should have same length as the number of inputs\')\n@@ -286,15 +299,13 @@\n     parser.add_argument(\'--output_tab\', help=\'Output filename for table\')\n     parser.add_argument(\'--expanded_output_tab\', default=None, help=\'Output filename for table\')\n     parser.add_argument(\'--possible_mismatches\', default=[\n-            \'AC\', \'AG\', \'AT\',\'CA\', \'CG\', \'CT\', \'GA\', \'GC\', \'GT\', \'TA\', \'TC\', \'TG\'\n+            \'AC\', \'AG\', \'AT\', \'CA\', \'CG\', \'CT\', \'GA\', \'GC\', \'GT\', \'TA\', \'TC\', \'TG\'\n         ], nargs=\'+\', help=\'specify mismatches that should be counted for the mismatch frequency. The format is Reference base -> observed base, eg AG for A to G mismatches.\')\n     parser.add_argument(\'--min\', \'--minimal_readlength\', type=int, help=\'minimum readlength\')\n     parser.add_argument(\'--max\', \'--maximal_readlength\', type=int, help=\'maximum readlength\')\n     parser.add_argument(\'--n_mm\', \'--number_allowed_mismatches\', type=int, default=1, help=\'discard reads with more than n mismatches\')\n     parser.add_argument(\'--five_p\', \'--ignore_5p_nucleotides\', type=int, default=0, help=\'when calculating nucleotide mismatch frequencies ignore the first N nucleotides of the read\')\n     parser.add_argument(\'--three_p\', \'--ignore_3p_nucleotides\', type=int, default=1, help=\'when calculating nucleotide mismatch frequencies ignore the last N nucleotides of the read\')\n-    #args = parser.parse_args([\'--input\', \'3mismatches_ago2ip_s2.bam\', \'3mismatches_ago2ip_ovary.bam\',\'--possible_mismatches\',\'AC\',\'AG\', \'CG\', \'TG\', \'CT\',\'--name\', \'Siomi1\', \'Siomi2\' , \'--five_p\', \'3\',\'--three_p\',\'3\',\'--output_pdf\', \'out.pdf\', \'--output_tab\', \'out.tab\', \'--expanded_output_tab\', \'expanded.tab\', \'--min\', \'20\', \'--max\', \'22\'])\n+    
# args = parser.parse_args([\'--input\', \'3mismatches_ago2ip_s2.bam\', \'3mismatches_ago2ip_ovary.bam\',\'--possible_mismatches\',\'AC\',\'AG\', \'CG\', \'TG\', \'CT\',\'--name\', \'Siomi1\', \'Siomi2\' , \'--five_p\', \'3\',\'--three_p\',\'3\',\'--output_pdf\', \'out.pdf\', \'--output_tab\', \'out.tab\', \'--expanded_output_tab\', \'expanded.tab\', \'--min\', \'20\', \'--max\', \'22\'])\n     args = parser.parse_args()\n     reduced_dict = main()\n-\n-\n'
b
diff -r 3613460e891e -r 2974c382105c mismatch_frequencies.xml
--- a/mismatch_frequencies.xml Wed Mar 23 09:59:33 2016 -0400
+++ b/mismatch_frequencies.xml Sat Dec 22 04:15:47 2018 -0500
[
@@ -1,25 +1,29 @@
 <tool id="mismatch_frequencies" name="Mismatch Frequencies" version="0.1.0" hidden="false" >
   <description>Analyze mismatch frequencies in BAM/SAM alignments</description>
   <requirements>
-    <requirement type="package" version="0.7.7">pysam</requirement>
-    <requirement type="package" version="0.14.1">pandas</requirement>
-    <requirement type="package" version="1.2.1">matplotlib</requirement>
+    <requirement type="package" version="0.8.3">pysam</requirement>
+    <requirement type="package" version="0.19.0">pandas</requirement>
+    <requirement type="package" version="1.5.3">matplotlib</requirement>
   </requirements>
-  <command interpreter="python">mismatch_frequencies.py --input 
- #for i in $rep
- "$i.input_file" 
- #end for
- --name 
- #for i in $rep
- "$i.input_file.element_identifier"
- #end for
-  --output_pdf $output_pdf --output_tab $output_tab --min $min_length --max $max_length
-                 --n_mm $number_of_mismatches
-                 --five_p $five_p
-                 --three_p $three_p
-                 --expanded_output_tab $expanded_tab
-                 --possible_mismatches $possible_mismatches
-  </command>
+  <command detect_errors="aggressive"><![CDATA[
+      python '$__tool_directory__'/mismatch_frequencies.py --input
+        #for i in $rep
+            "$i.input_file"
+        #end for
+        --name
+        #for i in $rep
+            "$i.input_file.element_identifier"
+        #end for
+        --output_pdf '$output_pdf'
+        --output_tab '$output_tab'
+        --min $min_length
+        --max $max_length
+        --n_mm $number_of_mismatches
+        --five_p $five_p
+        --three_p $three_p
+        --expanded_output_tab '$expanded_tab'
+        --possible_mismatches $possible_mismatches
+  ]]></command>
   <inputs>
     <repeat name="rep" title="alignment files">
       <param name="input_file" type="data" format="bam,sam" label="Alignment file" help="The input alignment file(s) for which to analyze the mismatches."/>
@@ -33,7 +37,7 @@
     <param name="five_p" label="Ignore mismatches in the first N nucleotides of a read" type="integer" value="0"/>
     <param name="three_p" label="Ignore mismatches in the last N nucleotides of a read" help="useful to discriminate between tailing events and editing events" type="integer" value="3"/>
     <param help="Output expanded tabular format" label="Nucleotide mismatches per reference sequence" name="expanded" type="select">
-        <option select="true" value="false">No</option>
+        <option selected="true" value="false">No</option>
         <option value="expanded">Yes</option>
     </param>
   </inputs>
@@ -66,12 +70,13 @@
 
 ***What it does***
 
-This tool reconstitues for each aligned read of an alignment file in SAM/BAM format whether
-a mismatch is annotated in the MD tag, and if that is the case counts the identity of the 
-mismatch relative to the reference sequence. The output is a PDF document with the calculated
-frequency for each mismatch that occured relative to the total number of valid reads and a table
-with the corresponding values. Read length can be limited to a specific read length, and 5 prime and 
-3 prime-most nucleotides of a read can be ignored.
+This tool reconstitutes for each aligned read of an alignment file in SAM/BAM
+format whether a mismatch is annotated in the MD tag, and if that is the case
+counts the identity of the mismatch relative to the reference sequence. The
+output is a PDF document with the calculated frequency for each mismatch that
+occurred relative to the total number of valid reads and a table with the
+corresponding values. Read length can be limited to a specific read length, and
+5 prime and 3 prime-most nucleotides of a read can be ignored.
 
 ----
 
b
diff -r 3613460e891e -r 2974c382105c tool_dependencies.xml
--- a/tool_dependencies.xml Wed Mar 23 09:59:33 2016 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,12 +0,0 @@
-<?xml version="1.0"?>
-<tool_dependency>
-    <package name="pysam" version="0.7.7">
-        <repository changeset_revision="0a5141bdf9d0" name="package_pysam_0_7_7" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" />
-    </package>
-    <package name="pandas" version="0.14.1">
-        <repository changeset_revision="ac9f317487a9" name="package_pandas_0_14" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" />
-    </package>
-    <package name="matplotlib" version="1.2.1">
-        <repository changeset_revision="48020985e28c" name="package_matplotlib_1_2" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" />
-    </package>
-</tool_dependency>