Previous changeset 1:3613460e891e (2016-03-23) |
Commit message:
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty |
modified:
mismatch_frequencies.py mismatch_frequencies.xml |
removed:
tool_dependencies.xml |
b |
diff -r 3613460e891e -r 2974c382105c mismatch_frequencies.py --- a/mismatch_frequencies.py Wed Mar 23 09:59:33 2016 -0400 +++ b/mismatch_frequencies.py Sat Dec 22 04:15:47 2018 -0500 |
[ |
b'@@ -1,28 +1,33 @@\n-import pysam, re, string\n-import matplotlib.pyplot as plt\n+import re\n+import string\n+import pysam\n+import matplotlib\n import pandas as pd\n-import json\n from collections import defaultdict\n from collections import OrderedDict\n import argparse\n import itertools\n \n+matplotlib.use(\'pdf\')\n+import matplotlib.pyplot as plt # noqa: E402\n+\n+\n class MismatchFrequencies:\n \'\'\'Iterate over a SAM/BAM alignment file, collecting reads with mismatches. One\n class instance per alignment file. The result_dict attribute will contain a\n nested dictionary with name, readlength and mismatch count.\'\'\'\n- def __init__(self, result_dict={}, alignment_file=None, name="name", minimal_readlength=21, \n+ def __init__(self, result_dict={}, alignment_file=None, name="name", minimal_readlength=21,\n maximal_readlength=21,\n- number_of_allowed_mismatches=1, \n- ignore_5p_nucleotides=0, \n+ number_of_allowed_mismatches=1,\n+ ignore_5p_nucleotides=0,\n ignore_3p_nucleotides=0,\n- possible_mismatches = [\n+ possible_mismatches=[\n \'AC\', \'AG\', \'AT\',\n \'CA\', \'CG\', \'CT\',\n \'GA\', \'GC\', \'GT\',\n \'TA\', \'TC\', \'TG\'\n ]):\n- \n+\n self.result_dict = result_dict\n self.name = name\n self.minimal_readlength = minimal_readlength\n@@ -31,20 +36,19 @@\n self.ignore_5p_nucleotides = ignore_5p_nucleotides\n self.ignore_3p_nucleotides = ignore_3p_nucleotides\n self.possible_mismatches = possible_mismatches\n- \n+\n if alignment_file:\n self.pysam_alignment = pysam.Samfile(alignment_file)\n- self.references = self.pysam_alignment.references #names of fasta reference sequences\n- result_dict[name]=self.get_mismatches(\n- self.pysam_alignment, \n- minimal_readlength, \n+ self.references = self.pysam_alignment.references # names of fasta reference sequences\n+ result_dict[name] = self.get_mismatches(\n+ self.pysam_alignment,\n+ minimal_readlength,\n maximal_readlength,\n possible_mismatches\n )\n- \n- def get_mismatches(self, pysam_alignment, minimal_readlength, \n+\n+ def get_mismatches(self, pysam_alignment, minimal_readlength,\n maximal_readlength, possible_mismatches):\n- mismatch_dict = defaultdict(int)\n rec_dd = lambda: defaultdict(rec_dd)\n len_dict = rec_dd()\n for alignedread in pysam_alignment:\n@@ -56,8 +60,8 @@\n len_dict[int(alignedread.rlen)][chromosome][\'total valid reads\'] = 1\n MD = alignedread.opt(\'MD\')\n if self.read_has_mismatch(alignedread, self.number_of_allowed_mismatches):\n- (ref_base, mismatch_base)=self.read_to_reference_mismatch(MD, alignedread.seq, alignedread.is_reverse)\n- if ref_base == None:\n+ (ref_base, mismatch_base) = self.read_to_reference_mismatch(MD, alignedread.seq, alignedread.is_reverse)\n+ if not ref_base:\n continue\n else:\n for i, base in enumerate(ref_base):\n@@ -68,7 +72,7 @@\n except TypeError:\n len_dict[int(alignedread.rlen)][chromosome][ref_base[i]+mismatch_base[i]] = 1\n return len_dict\n- \n+\n def read_is_valid(self, read, min_readlength, max_readlength):\n \'\'\'Filter out reads that are unmatched, too short or\n too long or that contian insertions\'\'\'\n@@ -80,17 +84,17 @@\n return False\n else:\n return True\n- \n+\n def read_has_mismatch(self, read, number_of_allowed_mismatches=1):\n \'\'\'keep only reads with one m'..b' \'name\': name,\n+ \'minimal_readlength\': args.min,\n+ \'maximal_readlength\': args.max,\n+ \'number_of_allowed_mismatches\': args.n_mm,\n+ \'ignore_5p_nucleotides\': args.five_p,\n+ \'ignore_3p_nucleotides\': args.three_p,\n+ \'possible_mismatches\': args.possible_mismatches}\n+ for alignment_file, name in zip(args.input, args.name)]\n return (kw_list, resultDict)\n \n+\n def nested_dict_to_df(dictionary):\n dictionary = {(outerKey, innerKey): values for outerKey, innerDict in dictionary.iteritems() for innerKey, values in innerDict.iteritems()}\n- df=pd.DataFrame.from_dict(dictionary).transpose()\n+ df = pd.DataFrame.from_dict(dictionary).transpose()\n df.index.names = [\'Library\', \'Readlength\']\n return df\n \n+\n def run_MismatchFrequencies(args):\n- kw_list, resultDict=setup_MismatchFrequencies(args)\n+ kw_list, resultDict = setup_MismatchFrequencies(args)\n references = [MismatchFrequencies(**kw_dict).references for kw_dict in kw_list]\n return (resultDict, references[0])\n \n+\n def main():\n result_dict, references = run_MismatchFrequencies(args)\n df = format_result_dict(result_dict, references, args.possible_mismatches)\n@@ -273,12 +286,12 @@\n plot_result(reduced_dict, args)\n reduced_df = nested_dict_to_df(reduced_dict)\n df_to_tab(reduced_df, args.output_tab)\n- if not args.expanded_output_tab == None:\n+ if args.expanded_output_tab:\n df_to_tab(df, args.expanded_output_tab)\n return reduced_dict\n \n if __name__ == "__main__":\n- \n+\n parser = argparse.ArgumentParser(description=\'Produce mismatch statistics for BAM/SAM alignment files.\')\n parser.add_argument(\'--input\', nargs=\'*\', help=\'Input files in SAM/BAM format\')\n parser.add_argument(\'--name\', nargs=\'*\', help=\'Name for input file to display in output file. Should have same length as the number of inputs\')\n@@ -286,15 +299,13 @@\n parser.add_argument(\'--output_tab\', help=\'Output filename for table\')\n parser.add_argument(\'--expanded_output_tab\', default=None, help=\'Output filename for table\')\n parser.add_argument(\'--possible_mismatches\', default=[\n- \'AC\', \'AG\', \'AT\',\'CA\', \'CG\', \'CT\', \'GA\', \'GC\', \'GT\', \'TA\', \'TC\', \'TG\'\n+ \'AC\', \'AG\', \'AT\', \'CA\', \'CG\', \'CT\', \'GA\', \'GC\', \'GT\', \'TA\', \'TC\', \'TG\'\n ], nargs=\'+\', help=\'specify mismatches that should be counted for the mismatch frequency. The format is Reference base -> observed base, eg AG for A to G mismatches.\')\n parser.add_argument(\'--min\', \'--minimal_readlength\', type=int, help=\'minimum readlength\')\n parser.add_argument(\'--max\', \'--maximal_readlength\', type=int, help=\'maximum readlength\')\n parser.add_argument(\'--n_mm\', \'--number_allowed_mismatches\', type=int, default=1, help=\'discard reads with more than n mismatches\')\n parser.add_argument(\'--five_p\', \'--ignore_5p_nucleotides\', type=int, default=0, help=\'when calculating nucleotide mismatch frequencies ignore the first N nucleotides of the read\')\n parser.add_argument(\'--three_p\', \'--ignore_3p_nucleotides\', type=int, default=1, help=\'when calculating nucleotide mismatch frequencies ignore the last N nucleotides of the read\')\n- #args = parser.parse_args([\'--input\', \'3mismatches_ago2ip_s2.bam\', \'3mismatches_ago2ip_ovary.bam\',\'--possible_mismatches\',\'AC\',\'AG\', \'CG\', \'TG\', \'CT\',\'--name\', \'Siomi1\', \'Siomi2\' , \'--five_p\', \'3\',\'--three_p\',\'3\',\'--output_pdf\', \'out.pdf\', \'--output_tab\', \'out.tab\', \'--expanded_output_tab\', \'expanded.tab\', \'--min\', \'20\', \'--max\', \'22\'])\n+ # args = parser.parse_args([\'--input\', \'3mismatches_ago2ip_s2.bam\', \'3mismatches_ago2ip_ovary.bam\',\'--possible_mismatches\',\'AC\',\'AG\', \'CG\', \'TG\', \'CT\',\'--name\', \'Siomi1\', \'Siomi2\' , \'--five_p\', \'3\',\'--three_p\',\'3\',\'--output_pdf\', \'out.pdf\', \'--output_tab\', \'out.tab\', \'--expanded_output_tab\', \'expanded.tab\', \'--min\', \'20\', \'--max\', \'22\'])\n args = parser.parse_args()\n reduced_dict = main()\n-\n-\n' |
b |
diff -r 3613460e891e -r 2974c382105c mismatch_frequencies.xml --- a/mismatch_frequencies.xml Wed Mar 23 09:59:33 2016 -0400 +++ b/mismatch_frequencies.xml Sat Dec 22 04:15:47 2018 -0500 |
[ |
@@ -1,25 +1,29 @@ <tool id="mismatch_frequencies" name="Mismatch Frequencies" version="0.1.0" hidden="false" > <description>Analyze mismatch frequencies in BAM/SAM alignments</description> <requirements> - <requirement type="package" version="0.7.7">pysam</requirement> - <requirement type="package" version="0.14.1">pandas</requirement> - <requirement type="package" version="1.2.1">matplotlib</requirement> + <requirement type="package" version="0.8.3">pysam</requirement> + <requirement type="package" version="0.19.0">pandas</requirement> + <requirement type="package" version="1.5.3">matplotlib</requirement> </requirements> - <command interpreter="python">mismatch_frequencies.py --input - #for i in $rep - "$i.input_file" - #end for - --name - #for i in $rep - "$i.input_file.element_identifier" - #end for - --output_pdf $output_pdf --output_tab $output_tab --min $min_length --max $max_length - --n_mm $number_of_mismatches - --five_p $five_p - --three_p $three_p - --expanded_output_tab $expanded_tab - --possible_mismatches $possible_mismatches - </command> + <command detect_errors="aggressive"><![CDATA[ + python '$__tool_directory__'/mismatch_frequencies.py --input + #for i in $rep + "$i.input_file" + #end for + --name + #for i in $rep + "$i.input_file.element_identifier" + #end for + --output_pdf '$output_pdf' + --output_tab '$output_tab' + --min $min_length + --max $max_length + --n_mm $number_of_mismatches + --five_p $five_p + --three_p $three_p + --expanded_output_tab '$expanded_tab' + --possible_mismatches $possible_mismatches + ]]></command> <inputs> <repeat name="rep" title="alignment files"> <param name="input_file" type="data" format="bam,sam" label="Alignment file" help="The input alignment file(s) for which to analyze the mismatches."/> @@ -33,7 +37,7 @@ <param name="five_p" label="Ignore mismatches in the first N nucleotides of a read" type="integer" value="0"/> <param name="three_p" label="Ignore mismatches in the last N nucleotides of a read" help="useful to discriminate between tailing events and editing events" type="integer" value="3"/> <param help="Output expanded tabular format" label="Nucleotide mismatches per reference sequence" name="expanded" type="select"> - <option select="true" value="false">No</option> + <option selected="true" value="false">No</option> <option value="expanded">Yes</option> </param> </inputs> @@ -66,12 +70,13 @@ ***What it does*** -This tool reconstitues for each aligned read of an alignment file in SAM/BAM format whether -a mismatch is annotated in the MD tag, and if that is the case counts the identity of the -mismatch relative to the reference sequence. The output is a PDF document with the calculated -frequency for each mismatch that occured relative to the total number of valid reads and a table -with the corresponding values. Read length can be limited to a specific read length, and 5 prime and -3 prime-most nucleotides of a read can be ignored. +This tool reconstitues for each aligned read of an alignment file in SAM/BAM +format whether a mismatch is annotated in the MD tag, and if that is the case +counts the identity of the mismatch relative to the reference sequence. The +output is a PDF document with the calculated frequency for each mismatch that +occured relative to the total number of valid reads and a table with the +corresponding values. Read length can be limited to a specific read length, and +5 prime and 3 prime-most nucleotides of a read can be ignored. ---- |
b |
diff -r 3613460e891e -r 2974c382105c tool_dependencies.xml --- a/tool_dependencies.xml Wed Mar 23 09:59:33 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,12 +0,0 @@ -<?xml version="1.0"?> -<tool_dependency> - <package name="pysam" version="0.7.7"> - <repository changeset_revision="0a5141bdf9d0" name="package_pysam_0_7_7" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" /> - </package> - <package name="pandas" version="0.14.1"> - <repository changeset_revision="ac9f317487a9" name="package_pandas_0_14" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" /> - </package> - <package name="matplotlib" version="1.2.1"> - <repository changeset_revision="48020985e28c" name="package_matplotlib_1_2" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" /> - </package> -</tool_dependency> |