Mercurial > repos > arkarachai-fungtammasan > microsatellite_ngs
changeset 7:3c05abb4452e default tip
add missing files
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Galaxy-Workflow-Estimate_minimum_informative_read_depth.ga Wed Apr 22 12:22:50 2015 -0400 @@ -0,0 +1,342 @@ +{ + "a_galaxy_workflow": "true", + "annotation": "", + "format-version": "0.1", + "name": "Estimate minimum informative read depth", + "steps": { + "0": { + "annotation": "See sample in https://usegalaxy.org/u/guru%40psu.edu/h/error-rates-files", + "id": 0, + "input_connections": {}, + "inputs": [ + { + "description": "See sample in https://usegalaxy.org/u/guru%40psu.edu/h/error-rates-files", + "name": "TR error rate" + } + ], + "name": "Input dataset", + "outputs": [], + "position": { + "left": 220, + "top": 737 + }, + "tool_errors": null, + "tool_id": null, + "tool_state": "{\"name\": \"TR error rate\"}", + "tool_version": null, + "type": "data_input", + "user_outputs": [] + }, + "1": { + "annotation": "replace 'A' with motif of interest", + "id": 1, + "input_connections": { + "input": { + "id": 0, + "output_name": "output" + } + }, + "inputs": [], + "name": "Filter", + "outputs": [ + { + "name": "out_file1", + "type": "input" + } + ], + "position": { + "left": 385, + "top": 260 + }, + "post_job_actions": { + "HideDatasetActionout_file1": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "out_file1" + } + }, + "tool_errors": null, + "tool_id": "Filter1", + "tool_state": "{\"input\": \"null\", \"__rerun_remap_job_id__\": null, \"header_lines\": \"\\\"0\\\"\", \"cond\": \"\\\"c4=='A'\\\"\", \"__page__\": 0}", + "tool_version": "1.1.0", + "type": "tool", + "user_outputs": [] + }, + "2": { + "annotation": "", + "id": 2, + "input_connections": { + "error_profile": { + "id": 1, + "output_name": "out_file1" + } + }, + "inputs": [], + "name": "Generate all possible combination of read profile", + "outputs": [ + { + "name": "output", + "type": "tabular" + } + ], + "position": { + "left": 653.5, + "top": 203 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": 
"toolshed.g2.bx.psu.edu/repos/arkarachai-fungtammasan/microsatellite_ngs/Profilegenerator/2.0.0", + "tool_state": "{\"error_profile\": \"null\", \"MOTIF\": \"\\\"A\\\"\", \"__page__\": 0, \"__rerun_remap_job_id__\": null, \"minprob\": \"\\\"1e-08\\\"\", \"Maxdepth\": \"\\\"30\\\"\"}", + "tool_version": "2.0.0", + "type": "tool", + "user_outputs": [] + }, + "3": { + "annotation": "", + "id": 3, + "input_connections": { + "input": { + "id": 2, + "output_name": "output" + } + }, + "inputs": [], + "name": "Unique", + "outputs": [ + { + "name": "outfile", + "type": "input" + } + ], + "position": { + "left": 676, + "top": 322 + }, + "post_job_actions": { + "HideDatasetActionoutfile": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "outfile" + } + }, + "tool_errors": null, + "tool_id": "toolshed.g2.bx.psu.edu/repos/bgruening/unique/bg_uniq/0.3", + "tool_state": "{\"__page__\": 0, \"ignore_case\": \"\\\"False\\\"\", \"adv_opts\": \"{\\\"adv_opts_selector\\\": \\\"basic\\\", \\\"__current_case__\\\": 0}\", \"__rerun_remap_job_id__\": null, \"is_numeric\": \"\\\"False\\\"\", \"input\": \"null\"}", + "tool_version": "0.3", + "type": "tool", + "user_outputs": [] + }, + "4": { + "annotation": "", + "id": 4, + "input_connections": { + "microsat_error_profile": { + "id": 0, + "output_name": "output" + }, + "microsat_raw": { + "id": 3, + "output_name": "outfile" + } + }, + "inputs": [], + "name": "Correct genotype for microsatellite errors", + "outputs": [ + { + "name": "microsat_corrected", + "type": "tabular" + } + ], + "position": { + "left": 661, + "top": 427 + }, + "post_job_actions": { + "HideDatasetActionmicrosat_corrected": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "microsat_corrected" + } + }, + "tool_errors": null, + "tool_id": "toolshed.g2.bx.psu.edu/repos/arkarachai-fungtammasan/microsatellite_ngs/GenotypeSTR/2.0.0", + "tool_state": "{\"microsat_raw\": \"null\", \"__page__\": 0, 
\"__rerun_remap_job_id__\": null, \"microsat_error_profile\": \"null\", \"expectedminorallele\": \"\\\"0.5\\\"\"}", + "tool_version": "2.0.0", + "type": "tool", + "user_outputs": [] + }, + "5": { + "annotation": "", + "id": 5, + "input_connections": { + "input": { + "id": 4, + "output_name": "microsat_corrected" + } + }, + "inputs": [], + "name": "Select", + "outputs": [ + { + "name": "out_file1", + "type": "input" + } + ], + "position": { + "left": 732, + "top": 629 + }, + "post_job_actions": { + "HideDatasetActionout_file1": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "out_file1" + } + }, + "tool_errors": null, + "tool_id": "Grep1", + "tool_state": "{\"__page__\": 0, \"input\": \"null\", \"invert\": \"\\\"false\\\"\", \"__rerun_remap_job_id__\": null, \"pattern\": \"\\\"hetero\\\"\"}", + "tool_version": "1.0.1", + "type": "tool", + "user_outputs": [] + }, + "6": { + "annotation": "replace 1 with motif size. For example, if motif is AAG, use 3", + "id": 6, + "input_connections": { + "input": { + "id": 5, + "output_name": "out_file1" + } + }, + "inputs": [], + "name": "Filter", + "outputs": [ + { + "name": "out_file1", + "type": "input" + } + ], + "position": { + "left": 746, + "top": 783 + }, + "post_job_actions": { + "HideDatasetActionout_file1": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "out_file1" + } + }, + "tool_errors": null, + "tool_id": "Filter1", + "tool_state": "{\"input\": \"null\", \"__rerun_remap_job_id__\": null, \"header_lines\": \"\\\"0\\\"\", \"cond\": \"\\\"abs(c7-c8)==1\\\"\", \"__page__\": 0}", + "tool_version": "1.1.0", + "type": "tool", + "user_outputs": [] + }, + "7": { + "annotation": "", + "id": 7, + "input_connections": { + "microsat_error_profile": { + "id": 0, + "output_name": "output" + }, + "microsat_raw": { + "id": 6, + "output_name": "out_file1" + } + }, + "inputs": [], + "name": "Evaluate the probability of the allele combination to generate read 
profile", + "outputs": [ + { + "name": "microsat_corrected", + "type": "tabular" + } + ], + "position": { + "left": 708.5, + "top": 913 + }, + "post_job_actions": { + "HideDatasetActionmicrosat_corrected": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "microsat_corrected" + } + }, + "tool_errors": null, + "tool_id": "toolshed.g2.bx.psu.edu/repos/arkarachai-fungtammasan/microsatellite_ngs/heteroprob/2.0.0", + "tool_state": "{\"microsat_raw\": \"null\", \"__page__\": 0, \"__rerun_remap_job_id__\": null, \"microsat_error_profile\": \"null\", \"expectedminorallele\": \"\\\"0.5\\\"\"}", + "tool_version": "2.0.0", + "type": "tool", + "user_outputs": [] + }, + "8": { + "annotation": "", + "id": 8, + "input_connections": { + "input": { + "id": 7, + "output_name": "microsat_corrected" + } + }, + "inputs": [], + "name": "Sort", + "outputs": [ + { + "name": "out_file1", + "type": "input" + } + ], + "position": { + "left": 761, + "top": 1133 + }, + "post_job_actions": { + "HideDatasetActionout_file1": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "out_file1" + } + }, + "tool_errors": null, + "tool_id": "sort1", + "tool_state": "{\"__page__\": 0, \"style\": \"\\\"num\\\"\", \"column\": \"{\\\"__class__\\\": \\\"UnvalidatedValue\\\", \\\"value\\\": \\\"12\\\"}\", \"__rerun_remap_job_id__\": null, \"order\": \"\\\"ASC\\\"\", \"input\": \"null\", \"column_set\": \"[{\\\"other_order\\\": \\\"ASC\\\", \\\"__index__\\\": 0, \\\"other_column\\\": {\\\"__class__\\\": \\\"UnvalidatedValue\\\", \\\"value\\\": \\\"6\\\"}, \\\"other_style\\\": \\\"num\\\"}]\"}", + "tool_version": "1.0.3", + "type": "tool", + "user_outputs": [] + }, + "9": { + "annotation": "", + "id": 9, + "input_connections": { + "input": { + "id": 8, + "output_name": "out_file1" + } + }, + "inputs": [], + "name": "Combine probability to generate read profile ", + "outputs": [ + { + "name": "output", + "type": "tabular" + } + ], + "position": { + 
"left": 722, + "top": 1264 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "toolshed.g2.bx.psu.edu/repos/arkarachai-fungtammasan/microsatellite_ngs/combineproballelecom/2.0.0", + "tool_state": "{\"input\": \"null\", \"__rerun_remap_job_id__\": null, \"__page__\": 0}", + "tool_version": "2.0.0", + "type": "tool", + "user_outputs": [] + } + } +} \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Galaxy-Workflow-TR_genome_profiling.ga Wed Apr 22 12:22:50 2015 -0400 @@ -0,0 +1,191 @@ +{ + "a_galaxy_workflow": "true", + "annotation": "", + "format-version": "0.1", + "name": "TR genome profiling", + "steps": { + "0": { + "annotation": "", + "id": 0, + "input_connections": {}, + "inputs": [ + { + "description": "", + "name": "fasta reference file" + } + ], + "name": "Input dataset", + "outputs": [], + "position": { + "left": 200, + "top": 250 + }, + "tool_errors": null, + "tool_id": null, + "tool_state": "{\"name\": \"fasta reference file\"}", + "tool_version": null, + "type": "data_input", + "user_outputs": [] + }, + "1": { + "annotation": "", + "id": 1, + "input_connections": { + "filePath": { + "id": 0, + "output_name": "output" + } + }, + "inputs": [], + "name": "Microsatellite detection", + "outputs": [ + { + "name": "stdout", + "type": "tabular" + } + ], + "position": { + "left": 430, + "top": 250 + }, + "post_job_actions": { + "ChangeDatatypeActionstdout": { + "action_arguments": { + "newtype": "tabular" + }, + "action_type": "ChangeDatatypeAction", + "output_name": "stdout" + }, + "HideDatasetActionstdout": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "stdout" + } + }, + "tool_errors": null, + "tool_id": "toolshed.g2.bx.psu.edu/repos/arkarachai-fungtammasan/microsatellite_ngs/microsatellite/1.0.0", + "tool_state": "{\"__page__\": 0, \"flankSetting\": \"{\\\"noflankdisplay\\\": \\\"False\\\", \\\"flankdisplay\\\": \\\"0\\\", \\\"__current_case__\\\": 0}\", \"filePath\": \"null\", \"minlength\": \"\\\"5\\\"\", \"hammingThreshold\": \"\\\"0\\\"\", \"period\": \"\\\"1\\\"\", \"surfix\": \"\\\"0\\\"\", \"prefix\": \"\\\"0\\\"\", \"__rerun_remap_job_id__\": null, \"inputFileSource\": \"{\\\"inputFileType\\\": \\\"fasta\\\", \\\"__current_case__\\\": 2}\", \"partialmotifs\": \"\\\"True\\\"\", \"multipleruns\": \"\\\"True\\\"\"}", + "tool_version": "1.0.0", + "type": 
"tool", + "user_outputs": [] + }, + "2": { + "annotation": "", + "id": 2, + "input_connections": { + "input": { + "id": 1, + "output_name": "stdout" + } + }, + "inputs": [], + "name": "Compute", + "outputs": [ + { + "name": "out_file1", + "type": "input" + } + ], + "position": { + "left": 688, + "top": 250 + }, + "post_job_actions": { + "HideDatasetActionout_file1": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "out_file1" + } + }, + "tool_errors": null, + "tool_id": "Add_a_column1", + "tool_state": "{\"input\": \"null\", \"__rerun_remap_job_id__\": null, \"cond\": \"\\\"int(c1+c2)\\\"\", \"round\": \"\\\"no\\\"\", \"__page__\": 0}", + "tool_version": "1.1.0", + "type": "tool", + "user_outputs": [] + }, + "3": { + "annotation": "", + "id": 3, + "input_connections": { + "input": { + "id": 2, + "output_name": "out_file1" + } + }, + "inputs": [], + "name": "Compute", + "outputs": [ + { + "name": "out_file1", + "type": "input" + } + ], + "position": { + "left": 916, + "top": 250 + }, + "post_job_actions": { + "HideDatasetActionout_file1": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "out_file1" + } + }, + "tool_errors": null, + "tool_id": "Add_a_column1", + "tool_state": "{\"input\": \"null\", \"__rerun_remap_job_id__\": null, \"cond\": \"\\\"len(c4)\\\"\", \"round\": \"\\\"no\\\"\", \"__page__\": 0}", + "tool_version": "1.1.0", + "type": "tool", + "user_outputs": [] + }, + "4": { + "annotation": "", + "id": 4, + "input_connections": { + "input": { + "id": 3, + "output_name": "out_file1" + } + }, + "inputs": [], + "name": "Cut", + "outputs": [ + { + "name": "out_file1", + "type": "tabular" + } + ], + "position": { + "left": 1144, + "top": 250 + }, + "post_job_actions": { + "ChangeDatatypeActionout_file1": { + "action_arguments": { + "newtype": "interval" + }, + "action_type": "ChangeDatatypeAction", + "output_name": "out_file1" + }, + "ColumnSetActionout_file1": { + "action_arguments": { + 
"chromCol": "1", + "endCol": "3", + "nameCol": "", + "startCol": "2", + "strandCol": "" + }, + "action_type": "ColumnSetAction", + "output_name": "out_file1" + } + }, + "tool_errors": null, + "tool_id": "Cut1", + "tool_state": "{\"columnList\": \"\\\"c6,c2,c9,c4,c1,c10\\\"\", \"input\": \"null\", \"delimiter\": \"\\\"T\\\"\", \"__rerun_remap_job_id__\": null, \"__page__\": 0}", + "tool_version": "1.0.2", + "type": "tool", + "user_outputs": [] + } + } +} \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Galaxy-Workflow-microsatellite_profiling.ga Wed Apr 22 12:22:50 2015 -0400 @@ -0,0 +1,764 @@ +{ + "a_galaxy_workflow": "true", + "annotation": "Profile length of microsatellites or short tandem repeats from short read data", + "format-version": "0.1", + "name": "microsatellite_profiling", + "steps": { + "0": { + "annotation": "input raw read that you want to detect Tandem repeats (TRs)/microsatellites", + "id": 0, + "input_connections": {}, + "inputs": [ + { + "description": "input raw read that you want to detect Tandem repeats (TRs)/microsatellites", + "name": "single end fastq" + } + ], + "name": "Input dataset", + "outputs": [], + "position": { + "left": 200, + "top": 274.765625 + }, + "tool_errors": null, + "tool_id": null, + "tool_state": "{\"name\": \"single end fastq\"}", + "tool_version": null, + "type": "data_input", + "user_outputs": [] + }, + "1": { + "annotation": "need to be prepared by user using TR genome profiling", + "id": 1, + "input_connections": {}, + "inputs": [ + { + "description": "need to be prepared by user using TR genome profiling", + "name": "TR in genome" + } + ], + "name": "Input dataset", + "outputs": [], + "position": { + "left": 177, + "top": 412.953125 + }, + "tool_errors": null, + "tool_id": null, + "tool_state": "{\"name\": \"TR in genome\"}", + "tool_version": null, + "type": "data_input", + "user_outputs": [] + }, + "2": { + "annotation": "https://usegalaxy.org/u/guru%40psu.edu/h/error-rates-files", + "id": 2, + "input_connections": {}, + "inputs": [ + { + "description": "https://usegalaxy.org/u/guru%40psu.edu/h/error-rates-files", + "name": "error rate file" + } + ], + "name": "Input dataset", + "outputs": [], + "position": { + "left": 1412.875, + "top": 384.671875 + }, + "tool_errors": null, + "tool_id": null, + "tool_state": "{\"name\": \"error rate file\"}", + "tool_version": null, + "type": "data_input", + "user_outputs": [] + }, + "3": { + "annotation": "", + "id": 3, +
"input_connections": { + "filePath": { + "id": 0, + "output_name": "output" + } + }, + "inputs": [], + "name": "Microsatellite detection", + "outputs": [ + { + "name": "stdout", + "type": "tabular" + } + ], + "position": { + "left": 441.953125, + "top": 252 + }, + "post_job_actions": { + "ChangeDatatypeActionstdout": { + "action_arguments": { + "newtype": "tabular" + }, + "action_type": "ChangeDatatypeAction", + "output_name": "stdout" + }, + "RenameDatasetActionstdout": { + "action_arguments": { + "newname": "TR in raw reads" + }, + "action_type": "RenameDatasetAction", + "output_name": "stdout" + } + }, + "tool_errors": null, + "tool_id": "toolshed.g2.bx.psu.edu/repos/arkarachai-fungtammasan/microsatellite_ngs/microsatellite/1.0.0", + "tool_state": "{\"__page__\": 0, \"flankSetting\": \"{\\\"noflankdisplay\\\": \\\"True\\\", \\\"__current_case__\\\": 1}\", \"filePath\": \"null\", \"minlength\": \"\\\"5\\\"\", \"hammingThreshold\": \"\\\"0\\\"\", \"period\": \"\\\"1\\\"\", \"surfix\": \"\\\"20\\\"\", \"prefix\": \"\\\"20\\\"\", \"__rerun_remap_job_id__\": null, \"inputFileSource\": \"{\\\"inputFileType\\\": \\\"fastq\\\", \\\"__current_case__\\\": 3}\", \"partialmotifs\": \"\\\"True\\\"\", \"multipleruns\": \"\\\"True\\\"\"}", + "tool_version": "1.0.0", + "type": "tool", + "user_outputs": [] + }, + "4": { + "annotation": "", + "id": 4, + "input_connections": { + "input": { + "id": 3, + "output_name": "stdout" + } + }, + "inputs": [], + "name": "Read name modifier", + "outputs": [ + { + "name": "output", + "type": "tabular" + } + ], + "position": { + "left": 418.484375, + "top": 385.03125 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "toolshed.g2.bx.psu.edu/repos/arkarachai-fungtammasan/microsatellite_ngs/space2underscore_readname/1.0.0", + "tool_state": "{\"__page__\": 0, \"column_n\": \"\\\"6\\\"\", \"__rerun_remap_job_id__\": null, \"input\": \"null\"}", + "tool_version": "1.0.0", + "type": "tool", + "user_outputs": [] + }, + "5": { + 
"annotation": "", + "id": 5, + "input_connections": { + "microsat_in_read": { + "id": 4, + "output_name": "output" + } + }, + "inputs": [], + "name": "Fetch flanking bases", + "outputs": [ + { + "name": "Leftflanking", + "type": "fastq" + }, + { + "name": "Rightflanking", + "type": "fastq" + } + ], + "position": { + "left": 390.9375, + "top": 509.046875 + }, + "post_job_actions": { + "ChangeDatatypeActionLeftflanking": { + "action_arguments": { + "newtype": "fastqsanger" + }, + "action_type": "ChangeDatatypeAction", + "output_name": "Leftflanking" + }, + "ChangeDatatypeActionRightflanking": { + "action_arguments": { + "newtype": "fastqsanger" + }, + "action_type": "ChangeDatatypeAction", + "output_name": "Rightflanking" + }, + "HideDatasetActionLeftflanking": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "Leftflanking" + }, + "HideDatasetActionRightflanking": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "Rightflanking" + } + }, + "tool_errors": null, + "tool_id": "toolshed.g2.bx.psu.edu/repos/arkarachai-fungtammasan/microsatellite_ngs/fetchflank/1.0.0", + "tool_state": "{\"qualitycutoff\": \"\\\"20\\\"\", \"lengthofbasetocheckquality\": \"\\\"20\\\"\", \"__rerun_remap_job_id__\": null, \"microsat_in_read\": \"null\", \"__page__\": 0}", + "tool_version": "1.0.0", + "type": "tool", + "user_outputs": [] + }, + "6": { + "annotation": "this tool has problems with parameter setting when you run it from pipeline. 
If you want different parameters, it is recommended that you clone the pipeline and modify it.", + "id": 6, + "input_connections": { + "paired|input1": { + "id": 5, + "output_name": "Leftflanking" + }, + "paired|input2": { + "id": 5, + "output_name": "Rightflanking" + } + }, + "inputs": [], + "name": "Map with BWA for Illumina", + "outputs": [ + { + "name": "output", + "type": "sam" + } + ], + "position": { + "left": 403, + "top": 686 + }, + "post_job_actions": { + "HideDatasetActionoutput": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "output" + } + }, + "tool_errors": null, + "tool_id": "toolshed.g2.bx.psu.edu/repos/devteam/bwa_wrappers/bwa_wrapper/1.2.3", + "tool_state": "{\"genomeSource\": \"{\\\"refGenomeSource\\\": \\\"history\\\", \\\"ownFile\\\": null, \\\"__current_case__\\\": 1}\", \"__page__\": 0, \"__rerun_remap_job_id__\": null, \"paired\": \"{\\\"input2\\\": null, \\\"sPaired\\\": \\\"paired\\\", \\\"input1\\\": null, \\\"__current_case__\\\": 1}\", \"params\": \"{\\\"outputTopN\\\": \\\"3\\\", \\\"suboptAlign\\\": \\\"\\\", \\\"gapOpenPenalty\\\": \\\"11\\\", \\\"maxGapOpens\\\": \\\"1\\\", \\\"mismatchPenalty\\\": \\\"3\\\", \\\"fracMissingAligns\\\": \\\"0.04\\\", \\\"disallowIndel\\\": \\\"5\\\", \\\"outputTopNDisc\\\": \\\"10\\\", \\\"disallowLongDel\\\": \\\"16\\\", \\\"readGroup\\\": {\\\"__current_case__\\\": 1, \\\"specReadGroup\\\": \\\"no\\\"}, \\\"maxInsertSize\\\": \\\"500\\\", \\\"__current_case__\\\": 1, \\\"maxEditDist\\\": \\\"0\\\", \\\"gapExtensPenalty\\\": \\\"4\\\", \\\"maxGapExtens\\\": \\\"-1\\\", \\\"maxEditDistSeed\\\": \\\"2\\\", \\\"noIterSearch\\\": \\\"False\\\", \\\"seed\\\": \\\"-1\\\", \\\"maxOccurPairing\\\": \\\"100000\\\", \\\"source_select\\\": \\\"full\\\"}\", \"suppressHeader\": \"\\\"False\\\"\"}", + "tool_version": "1.2.3", + "type": "tool", + "user_outputs": [] + }, + "7": { + "annotation": "", + "id": 7, + "input_connections": { + "input1": { + "id": 6, + "output_name": "output"
+ } + }, + "inputs": [], + "name": "Filter SAM", + "outputs": [ + { + "name": "out_file1", + "type": "sam" + } + ], + "position": { + "left": 778, + "top": 489.953125 + }, + "post_job_actions": { + "HideDatasetActionout_file1": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "out_file1" + } + }, + "tool_errors": null, + "tool_id": "toolshed.g2.bx.psu.edu/repos/devteam/sam_bitwise_flag_filter/sam_bw_filter/1.0.0", + "tool_state": "{\"__page__\": 0, \"__rerun_remap_job_id__\": null, \"input1\": \"null\", \"bits\": \"[{\\\"states\\\": \\\"0\\\", \\\"__index__\\\": 0, \\\"flags\\\": \\\"--0x0004\\\"}]\"}", + "tool_version": "1.0.0", + "type": "tool", + "user_outputs": [] + }, + "8": { + "annotation": "remove all mapped read with soft mask", + "id": 8, + "input_connections": { + "input": { + "id": 7, + "output_name": "out_file1" + } + }, + "inputs": [], + "name": "Filter", + "outputs": [ + { + "name": "out_file1", + "type": "input" + } + ], + "position": { + "left": 781.5, + "top": 602 + }, + "post_job_actions": { + "HideDatasetActionout_file1": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "out_file1" + } + }, + "tool_errors": null, + "tool_id": "Filter1", + "tool_state": "{\"input\": \"null\", \"__rerun_remap_job_id__\": null, \"header_lines\": \"\\\"0\\\"\", \"cond\": \"\\\"c6.count('S')==0\\\"\", \"__page__\": 0}", + "tool_version": "1.1.0", + "type": "tool", + "user_outputs": [] + }, + "9": { + "annotation": "", + "id": 9, + "input_connections": { + "input": { + "id": 8, + "output_name": "out_file1" + } + }, + "inputs": [], + "name": "Sort", + "outputs": [ + { + "name": "out_file1", + "type": "input" + } + ], + "position": { + "left": 777.5, + "top": 739 + }, + "post_job_actions": { + "HideDatasetActionout_file1": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "out_file1" + } + }, + "tool_errors": null, + "tool_id": "sort1", + "tool_state": "{\"__page__\": 0, 
\"style\": \"\\\"alpha\\\"\", \"column\": \"{\\\"__class__\\\": \\\"UnvalidatedValue\\\", \\\"value\\\": \\\"1\\\"}\", \"__rerun_remap_job_id__\": null, \"order\": \"\\\"ASC\\\"\", \"input\": \"null\", \"column_set\": \"[]\"}", + "tool_version": "1.0.3", + "type": "tool", + "user_outputs": [] + }, + "10": { + "annotation": "", + "id": 10, + "input_connections": { + "flankedbasesSAM": { + "id": 9, + "output_name": "out_file1" + } + }, + "inputs": [], + "name": "Combine mapped flaked bases", + "outputs": [ + { + "name": "output", + "type": "tabular" + } + ], + "position": { + "left": 723.328125, + "top": 887.828125 + }, + "post_job_actions": { + "HideDatasetActionoutput": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "output" + } + }, + "tool_errors": null, + "tool_id": "toolshed.g2.bx.psu.edu/repos/arkarachai-fungtammasan/microsatellite_ngs/PEsortedSAM2readprofile/1.0.0", + "tool_state": "{\"__page__\": 0, \"__rerun_remap_job_id__\": null, \"maxoriginalreadlength\": \"\\\"101\\\"\", \"maxTRlength\": \"\\\"100\\\"\", \"flankedbasesSAM\": \"null\", \"twobitref\": \"null\"}", + "tool_version": "1.0.0", + "type": "tool", + "user_outputs": [] + }, + "11": { + "annotation": "", + "id": 11, + "input_connections": { + "input1": { + "id": 4, + "output_name": "output" + }, + "input2": { + "id": 10, + "output_name": "output" + } + }, + "inputs": [], + "name": "Join two Datasets", + "outputs": [ + { + "name": "out_file1", + "type": "input" + } + ], + "position": { + "left": 1116.375, + "top": 431.84375 + }, + "post_job_actions": { + "ColumnSetActionout_file1": { + "action_arguments": { + "chromCol": "10", + "endCol": "14", + "nameCol": "", + "startCol": "13", + "strandCol": "" + }, + "action_type": "ColumnSetAction", + "output_name": "out_file1" + }, + "HideDatasetActionout_file1": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "out_file1" + } + }, + "tool_errors": null, + "tool_id": "join1", + 
"tool_state": "{\"input2\": \"null\", \"__page__\": 0, \"field1\": \"{\\\"__class__\\\": \\\"UnvalidatedValue\\\", \\\"value\\\": \\\"6\\\"}\", \"partial\": \"\\\"\\\"\", \"field2\": \"{\\\"__class__\\\": \\\"UnvalidatedValue\\\", \\\"value\\\": \\\"1\\\"}\", \"__rerun_remap_job_id__\": null, \"fill_empty_columns\": \"{\\\"fill_empty_columns_switch\\\": \\\"no_fill\\\", \\\"__current_case__\\\": 0}\", \"unmatched\": \"\\\"\\\"\", \"input1\": \"null\"}", + "tool_version": "2.0.2", + "type": "tool", + "user_outputs": [] + }, + "12": { + "annotation": "", + "id": 12, + "input_connections": { + "input1": { + "id": 1, + "output_name": "output" + }, + "input2": { + "id": 11, + "output_name": "out_file1" + } + }, + "inputs": [], + "name": "Join", + "outputs": [ + { + "name": "output", + "type": "interval" + } + ], + "position": { + "left": 1113.96875, + "top": 560.296875 + }, + "post_job_actions": { + "HideDatasetActionoutput": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "output" + } + }, + "tool_errors": null, + "tool_id": "toolshed.g2.bx.psu.edu/repos/devteam/join/gops_join_1/1.0.0", + "tool_state": "{\"input2\": \"null\", \"__page__\": 0, \"input1\": \"null\", \"min\": \"\\\"1\\\"\", \"__rerun_remap_job_id__\": null, \"fill\": \"\\\"none\\\"\"}", + "tool_version": "1.0.0", + "type": "tool", + "user_outputs": [] + }, + "13": { + "annotation": "", + "id": 13, + "input_connections": { + "input": { + "id": 12, + "output_name": "output" + } + }, + "inputs": [], + "name": "Check microsatellites motif compatibility", + "outputs": [ + { + "name": "output", + "type": "tabular" + } + ], + "position": { + "left": 1122.328125, + "top": 710.84375 + }, + "post_job_actions": { + "HideDatasetActionoutput": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "output" + } + }, + "tool_errors": null, + "tool_id": "toolshed.g2.bx.psu.edu/repos/arkarachai-fungtammasan/microsatellite_ngs/microsatcompat/1.0.0", + 
"tool_state": "{\"__page__\": 0, \"input\": \"null\", \"__rerun_remap_job_id__\": null, \"column1\": \"\\\"4\\\"\", \"column2\": \"\\\"10\\\"\"}", + "tool_version": "1.0.0", + "type": "tool", + "user_outputs": [] + }, + "14": { + "annotation": "", + "id": 14, + "input_connections": { + "input": { + "id": 13, + "output_name": "output" + } + }, + "inputs": [], + "name": "Select uninterrupted microsatellites", + "outputs": [ + { + "name": "output", + "type": "tabular" + } + ], + "position": { + "left": 1131.21875, + "top": 852.90625 + }, + "post_job_actions": { + "HideDatasetActionoutput": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "output" + } + }, + "tool_errors": null, + "tool_id": "toolshed.g2.bx.psu.edu/repos/arkarachai-fungtammasan/microsatellite_ngs/microsatpurity/1.0.0", + "tool_state": "{\"__page__\": 0, \"column_n\": \"\\\"0\\\"\", \"__rerun_remap_job_id__\": null, \"period\": \"\\\"1\\\"\", \"input\": \"null\"}", + "tool_version": "1.0.0", + "type": "tool", + "user_outputs": [] + }, + "15": { + "annotation": "", + "id": 15, + "input_connections": { + "input": { + "id": 14, + "output_name": "output" + } + }, + "inputs": [], + "name": "Cut", + "outputs": [ + { + "name": "out_file1", + "type": "tabular" + } + ], + "position": { + "left": 1161, + "top": 963.953125 + }, + "post_job_actions": { + "HideDatasetActionout_file1": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "out_file1" + } + }, + "tool_errors": null, + "tool_id": "Cut1", + "tool_state": "{\"columnList\": \"\\\"c1,c2,c3,c4,c5,c7\\\"\", \"input\": \"null\", \"delimiter\": \"\\\"T\\\"\", \"__rerun_remap_job_id__\": null, \"__page__\": 0}", + "tool_version": "1.0.2", + "type": "tool", + "user_outputs": [] + }, + "16": { + "annotation": "", + "id": 16, + "input_connections": { + "input": { + "id": 15, + "output_name": "out_file1" + } + }, + "inputs": [], + "name": "Add column", + "outputs": [ + { + "name": "out_file1", + "type": 
"input" + } + ], + "position": { + "left": 1168, + "top": 1069.953125 + }, + "post_job_actions": { + "HideDatasetActionout_file1": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "out_file1" + } + }, + "tool_errors": null, + "tool_id": "addValue", + "tool_state": "{\"__page__\": 0, \"input\": \"null\", \"__rerun_remap_job_id__\": null, \"exp\": \"\\\"_\\\"\", \"iterate\": \"\\\"no\\\"\"}", + "tool_version": "1.0.0", + "type": "tool", + "user_outputs": [] + }, + "17": { + "annotation": "", + "id": 17, + "input_connections": { + "input1": { + "id": 16, + "output_name": "out_file1" + } + }, + "inputs": [], + "name": "Merge Columns", + "outputs": [ + { + "name": "out_file1", + "type": "tabular" + } + ], + "position": { + "left": 1175, + "top": 1184.953125 + }, + "post_job_actions": { + "HideDatasetActionout_file1": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "out_file1" + } + }, + "tool_errors": null, + "tool_id": "mergeCols1", + "tool_state": "{\"__page__\": 0, \"input1\": \"null\", \"__rerun_remap_job_id__\": null, \"col2\": \"{\\\"__class__\\\": \\\"UnvalidatedValue\\\", \\\"value\\\": \\\"7\\\"}\", \"col1\": \"{\\\"__class__\\\": \\\"UnvalidatedValue\\\", \\\"value\\\": \\\"1\\\"}\", \"columns\": \"[{\\\"__index__\\\": 0, \\\"datacol\\\": {\\\"__class__\\\": \\\"UnvalidatedValue\\\", \\\"value\\\": \\\"2\\\"}}, {\\\"__index__\\\": 1, \\\"datacol\\\": {\\\"__class__\\\": \\\"UnvalidatedValue\\\", \\\"value\\\": \\\"7\\\"}}, {\\\"__index__\\\": 2, \\\"datacol\\\": {\\\"__class__\\\": \\\"UnvalidatedValue\\\", \\\"value\\\": \\\"3\\\"}}]\"}", + "tool_version": "1.0.1", + "type": "tool", + "user_outputs": [] + }, + "18": { + "annotation": "", + "id": 18, + "input_connections": { + "input1": { + "id": 17, + "output_name": "out_file1" + } + }, + "inputs": [], + "name": "Group", + "outputs": [ + { + "name": "out_file1", + "type": "tabular" + } + ], + "position": { + "left": 1175, + "top": 
1297.953125 + }, + "post_job_actions": { + "HideDatasetActionout_file1": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "out_file1" + } + }, + "tool_errors": null, + "tool_id": "Grouping1", + "tool_state": "{\"operations\": \"[{\\\"opcol\\\": {\\\"__class__\\\": \\\"UnvalidatedValue\\\", \\\"value\\\": \\\"6\\\"}, \\\"__index__\\\": 0, \\\"optype\\\": \\\"cat\\\", \\\"opround\\\": \\\"no\\\"}, {\\\"opcol\\\": {\\\"__class__\\\": \\\"UnvalidatedValue\\\", \\\"value\\\": \\\"4\\\"}, \\\"__index__\\\": 1, \\\"optype\\\": \\\"cat_uniq\\\", \\\"opround\\\": \\\"no\\\"}]\", \"__page__\": 0, \"input1\": \"null\", \"ignorelines\": \"null\", \"groupcol\": \"{\\\"__class__\\\": \\\"UnvalidatedValue\\\", \\\"value\\\": \\\"8\\\"}\", \"__rerun_remap_job_id__\": null, \"ignorecase\": \"\\\"False\\\"\"}", + "tool_version": "2.1.0", + "type": "tool", + "user_outputs": [] + }, + "19": { + "annotation": "", + "id": 19, + "input_connections": { + "input": { + "id": 18, + "output_name": "out_file1" + } + }, + "inputs": [], + "name": "Filter", + "outputs": [ + { + "name": "out_file1", + "type": "input" + } + ], + "position": { + "left": 1186, + "top": 1405.953125 + }, + "post_job_actions": { + "RenameDatasetActionout_file1": { + "action_arguments": { + "newname": "TR profile" + }, + "action_type": "RenameDatasetAction", + "output_name": "out_file1" + } + }, + "tool_errors": null, + "tool_id": "Filter1", + "tool_state": "{\"input\": \"null\", \"__rerun_remap_job_id__\": null, \"header_lines\": \"\\\"0\\\"\", \"cond\": \"\\\"c2.count(\\\\\\\",\\\\\\\")>=4\\\"\", \"__page__\": 0}", + "tool_version": "1.1.0", + "type": "tool", + "user_outputs": [] + }, + "20": { + "annotation": "", + "id": 20, + "input_connections": { + "microsat_error_profile": { + "id": 2, + "output_name": "output" + }, + "microsat_raw": { + "id": 19, + "output_name": "out_file1" + } + }, + "inputs": [], + "name": "Correct genotype for microsatellite errors", + "outputs": [ + { + 
"name": "microsat_corrected", + "type": "tabular" + } + ], + "position": { + "left": 1591.328125, + "top": 456.8125 + }, + "post_job_actions": { + "RenameDatasetActionmicrosat_corrected": { + "action_arguments": { + "newname": "Genotype file" + }, + "action_type": "RenameDatasetAction", + "output_name": "microsat_corrected" + } + }, + "tool_errors": null, + "tool_id": "toolshed.g2.bx.psu.edu/repos/arkarachai-fungtammasan/microsatellite_ngs/GenotypeSTR/2.0.0", + "tool_state": "{\"microsat_raw\": \"null\", \"__page__\": 0, \"__rerun_remap_job_id__\": null, \"microsat_error_profile\": \"null\", \"expectedminorallele\": \"\\\"0.5\\\"\"}", + "tool_version": "2.0.0", + "type": "tool", + "user_outputs": [] + } + } +} \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.md Wed Apr 22 12:22:50 2015 -0400 @@ -0,0 +1,103 @@ +# *STR-FM*, a short tandem repeat profiling using a flank-based mapping approach + +## User manual and guide +We designed the STR profiling pipeline as a collection of tools that can be executed either from the command line or via a GUI on Galaxy. The easiest way to use the STR-FM pipeline is via the Galaxy platform. Currently, all tools are available in the Galaxy main toolshed (see ‘Installation of STR-FM tools from toolshed’ below) and on the Galaxy test website (STR-FM: microsatellite analysis). + +## Overview + +Our tools in ‘str_fm’ can be used to: + +**(1) profile STRs from short read data with STR-FM pipeline** (tools: ‘STR detection’, ‘Read name modifier’, ‘Fetch bases flanking’, ‘Combine mapped faux paired-end reads’, ‘Check STR motif compatibility between reference and read STRs’, ‘Select uninterrupted STRs’) + +This pipeline needs several tools on Galaxy to complete the process. It can be customized with a different mapper or STR detection algorithm. Either single-end or paired-end sequencing data can be utilized; for paired-end read data, each read is treated separately. The core of the pipeline consists of the following three procedures: + +First, STR-FM runs a short-read STR detection tool using a string comparison algorithm (see publication details). The algorithm can detect exact (pure, or uninterrupted) STRs (mono- through hexanucleotide STRs greater than or equal to two repeats), incomplete motifs (e.g., ATATATA), interrupted STRs (e.g., AAAATAAAAA), or multiple STRs in a read. Reads that do not have sufficient upstream or downstream sequences flanking the STRs are discarded (we used a threshold of 20 bp on each side of an STR). Each read is split into two “pseudoreads,” containing the upstream and downstream flanks surrounding the STR. 
+ +Second, these are mapped to the reference genome using a standard paired-end read-mapping algorithm, e.g., BWA, Bowtie, or Bowtie2, treating each pair of flanking sequences as a faux paired-end read. + +Finally, STR-FM runs a profiler tool, which groups all reads with STRs that are mapped to the same location in the reference genome. As a result, an array of all STR lengths from the reads mapping to a particular STR-containing locus is generated. + +**(2) genotype STRs with error correction** (tool ‘Correct genotype for STR errors’) + +This pipeline needs only one of our tools to complete the process. It takes an STR-profile file and a sequencing error rates file as inputs. The program will calculate the maximum likelihood of genotype for each STR locus in the STR-profile file. Then it will report the most likely genotype and the log odds ratio between their probabilities, which can be interpreted as a confidence of genotyping (the more this value deviates from 0, the more confidence we have in this genotype). + +**(3) estimate the minimum informative read depth from error rates** (tools: ‘Generate all possible combination of STR length profile’, ‘Evaluate the probability of the allele combination to generate read profile’, ‘Combine read profile probabilities’) + +This pipeline needs other tools on Galaxy to complete the process. This pipeline will generate all possible read profiles from sequencing error spectrum, select the profiles that can distinguish heterozygote from homozygote, calculate the probability to produce such profiles from sequencing error spectrum, and report the probability that a certain sequence depth can distinguish heterozygote from homozygote under given sequencing error rates (see publication details). We recommend running with less than 10x depth for an initial trial. + +**(4) convert informative read depth to locus-specific and genome-wide sequencing depth** (tool ‘Convert informative read depth to sequencing depth’). 
+ +This pipeline needs only one of our tools to complete the process. It will convert *informative read depth* to *locus-specific sequencing depth* (given read length) and *genome-wide sequencing depth* (given confidence intervals). + + +## Description of tools + +The short description for each tool is provided below. + +1. “STR detection” = Detect STRs from short reads (FASTQ), reference genome (FASTA), or alignments (SAM) +2. “Read name modifier” = Change space in read name to ‘_’ to prevent read name truncation by mapping tools +3. “Fetch bases flanking” = Generate two FASTQ files containing flanking bases around STRs for mapping as faux paired-end reads +4. “Combine mapped faux paired-end reads” = For each mapped faux paired-end read, infer STR sequence in reference genome between the two mapped ends of the pair +5. “Check STR motif compatibility between reference and read STRs” = Check if two STRs have the same motif +6. “Select uninterrupted STRs” = Select STRs that do not contain an interruption +7. “Correct genotype for STR errors” = Build error correction model from pre-defined error rates and identify most likely genotype of the input data +8. “Generate all possible combination of STR length profile” = Use STR error spectrum to generate all possible combinations of read profile at each read depth +9. “Evaluate the probability of the allele combination to generate read profile” = Calculate the probability of a given genotype to generate read profiles (instead of finding most likely genotype like tool number 7) +10. “Combine read profile probabilities” = Sum the probability of the given allele combinations to generate read profile at certain read depth +11. “Convert informative read depth to sequencing depth” = Calculate ‘locus-specific’ and ‘genome-wide’ sequencing depth from the given informative read depth +The detailed description for each tool is embedded within the tool. 
+ +## Citing *STR-FM* +Fungtammasan A, Ananda G, Hile SE, Su MS, Sun C, Harris R, Medvedev P, Eckert K, Makova KD. 2015. Accurate Typing of Short Tandem Repeats from Genome-wide Sequencing Data and its Applications, Genome Research + +## Installation of STR-FM tools from toolshed + + +The installation can be done as follows + + +1 Install and set configuration of local Galaxy + +1.1 Download and install Galaxy (https://wiki.galaxyproject.org/Admin/GetGalaxy). Galaxy works on both Unix and Mac OS. + +1.2 From your Galaxy directory, add your E-mail as admin E-mail to the Galaxy configuration file. Depending on the Galaxy version, this file can be either universe_wsgi.ini or config/galaxy.ini (https://wiki.galaxyproject.org/Admin/Interface) + +1.3 Set directory for tool dependencies (step 2 in https://wiki.galaxyproject.org/Admin/Tools/AddToolFromToolShedTutorial). + +1.4 Run local Galaxy from the command line by running ‘sh run.sh’ from your Galaxy directory. + +1.5 Open your Galaxy from your browser at address http://localhost:8080 (https://wiki.galaxyproject.org/Admin/GetGalaxy) + +1.6 Register using your admin E-mail in the ‘User’ tab on the top. + +1.7 Refresh your browser + + +2 Install tools and dependencies + +2.1 From your local Galaxy, click the ‘Admin’ tab on the top. + +2.2 On the left panel, click ‘Search and browse tool sheds’ under ‘Tool sheds’. ‘Accessible Galaxy tool sheds’ will appear on the main panel. + +2.3 Click on ‘Galaxy main tool shed’ and select ‘Browse valid repositories’. (https://wiki.galaxyproject.org/Admin/Tools/AddToolFromToolShedTutorial) + +2.4 Type ‘str_fm’ in the search box and press Enter. + +2.5 The ‘suite_str_fm_0_1’ repository that has ‘arkarachai-fungtammasan’ as the owner will appear. The user may click on this repository name and click ‘Preview and install’. The ‘Install to Galaxy’ button will appear on the upper right corner. 
This button allows the user to install all our tools and workflows -- pipelines containing tools for a specific purpose such as STR profiling from short read sequencing data, microsatellite detection of the reference genome, and estimating minimum informative read depth. None of our tools have any dependencies. However, some of the other tools that are used in our workflows (e.g. SAM flag filter, unique element selection, etc.) are not included in the standard Galaxy installation. For the user’s convenience, we included all dependency tools for the workflows in this repository. Therefore, installing ‘suite_str_fm_0_1’ will be sufficient to operate all workflows we provided. + +2.6 After clicking on ‘Install to Galaxy’ and the ‘Install’ button in the confirmation page, all our tools, workflows, and test datasets will be downloaded to your local Galaxy. After the download is completed, all our tools will be available on your local Galaxy. If the user wants to use the workflows that we suggested (i.e. STR profiling from short read sequencing data, microsatellite detection of the reference genome, and estimating minimum informative read depth), please proceed to step 3. + +2.7 Refresh your browser + + +3 Install workflows + +3.1 Click on the ‘Admin’ tab at the top again. + +3.2 On the right panel, click ‘Manage installed tool shed repositories’ under ‘Server’. ‘Installed tool shed repositories’ will appear on the main panel. + +3.3 Click to open ‘str_fm’ repository. + +3.4 Scroll down to the ‘Workflows’ section and select the workflow that you want to install. The SVG graphic of the workflow will appear. + +3.5 Click on the ‘Repository Actions’ on the upper right corner and select ‘Import workflow to Galaxy’. If successful, the message ‘Workflow <workflow name> imported successfully’ will appear. Once the workflow is imported to your Galaxy, you can view and modify it from the ‘Workflow’ tab on the top.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/commandline_sample_STR-FM_estimate_mininum_informative_Read_Depth Wed Apr 22 12:22:50 2015 -0400
@@ -0,0 +1,35 @@
+## Sample PBS script: estimate the minimum informative read depth from STR error rates using STR-FM
+## ${MOTIF} and ${OUTPUT} are not assigned in this script; they must be supplied by the caller (e.g. through the job environment)
+##requirement
+##1 STR error rates (can be downloaded from https://usegalaxy.org/u/guru%40psu.edu/h/error-rates-files) --> errorrate.bymajorallele
+##
+echo " "
+echo " "
+echo "Job started on `hostname` at `date`"
+cd /working/directory/
+echo ${MOTIF}
+echo ${OUTPUT}
+echo " "
+echo "Generate all possible combination of STR length profile" ## See detail in profilegenerator.xml on https://github.com/Arkarachai/STR-FM
+python profilegenerator.py errorrate.bymajorallele ${MOTIF} 30 > ${OUTPUT}.30 ## presumably 30 is the maximum read depth to enumerate -- confirm against profilegenerator.xml
+
+echo "remove duplicated profiles"
+sort ${OUTPUT}.30 | uniq > ${OUTPUT}.30.sort
+
+echo "genotyping using error correction model" ## See detail in GenotypingSTR.xml on https://github.com/Arkarachai/STR-FM
+python GenotypeTRcorrection.py ${OUTPUT}.30.sort errorrate.bymajorallele ${OUTPUT}.30.prob 0.5
+
+echo "select only full motif different --> need to replace 4 with motif size (1-6)"
+grep hetero ${OUTPUT}.30.prob | awk '(($7-$8)==4) || (($8-$7)==4) {print $0}' > ${OUTPUT}.30.prob.screen ## keep lines containing "hetero" whose fields 7 and 8 differ by exactly 4
+
+echo "Evaluate the probability of the allele combination to generate read profile" ## See detail in probvalueforhetero.xml on https://github.com/Arkarachai/STR-FM
+python heteroprob.py ${OUTPUT}.30.prob.screen ${INPUT} > ${OUTPUT}.30.bino ## NOTE(review): ${INPUT} is never set or echoed in this script -- presumably the error-rate file (errorrate.bymajorallele); confirm
+
+echo "formatting"
+sort -k 12n,12 -k 6n,6 ${OUTPUT}.30.bino > ${OUTPUT}.30.bino.sort ## numeric sort on field 12, then on field 6
+
+echo "Combine read profile probabilities" ## See detail in combineprobforallelecombination.xml on https://github.com/Arkarachai/STR-FM
+python combinedprobforallelecombination.py ${OUTPUT}.30.bino.sort > ${OUTPUT}.30.bino.sort.plot
+
+
+echo "Job end on `hostname` at `date`"
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/commandline_sample_STR-FM_reference_profiling Wed Apr 22 12:22:50 2015 -0400
@@ -0,0 +1,25 @@
+## Sample PBS script: locate STRs in a reference genome with STR-FM version 1.0.0 (April 20, 2014)
+## ${INPUT} is not assigned in this script; it must be supplied by the caller (e.g. through the job environment)
+##requirement
+##1 reference genome in FASTA format --> ${INPUT}.fa
+##
+echo " "
+echo " "
+echo "Job started on `hostname` at `date`"
+cd /working/directory/
+echo " "
+echo " detect STR in reference genome" ## See detail in microsatellite.xml on https://github.com/Arkarachai/STR-FM
+python microsatellite.py ${INPUT}.fa --fasta --period=1 --partialmotifs --minlength=4 --prefix=0 --suffix=0 --hamming=0 --multipleruns --flankdisplay=0 --splitbyvalidity >${INPUT}.mono.out ## period 1, minimum length 4
+python microsatellite.py ${INPUT}.fa --fasta --period=2 --partialmotifs --minlength=6 --prefix=0 --suffix=0 --hamming=0 --multipleruns --flankdisplay=0 --splitbyvalidity >${INPUT}.di.out ## period 2, minimum length 6
+python microsatellite.py ${INPUT}.fa --fasta --period=3 --partialmotifs --minlength=6 --prefix=0 --suffix=0 --hamming=0 --multipleruns --flankdisplay=0 --splitbyvalidity >${INPUT}.tri.out ## period 3, minimum length 6
+python microsatellite.py ${INPUT}.fa --fasta --period=4 --partialmotifs --minlength=8 --prefix=0 --suffix=0 --hamming=0 --multipleruns --flankdisplay=0 --splitbyvalidity >${INPUT}.tetra.out ## period 4, minimum length 8
+
+echo "formatting" ## tab-separated output: field 6, field 2, field 2 + field 1, field 4, field 1, length of field 4
+awk 'BEGIN{FS=OFS="\t"} {print $6,$2,$2+$1,$4,$1,length($4)}' ${INPUT}.mono.out > ${INPUT}.mono.TR
+awk 'BEGIN{FS=OFS="\t"} {print $6,$2,$2+$1,$4,$1,length($4)}' ${INPUT}.di.out > ${INPUT}.di.TR
+awk 'BEGIN{FS=OFS="\t"} {print $6,$2,$2+$1,$4,$1,length($4)}' ${INPUT}.tri.out > ${INPUT}.tri.TR
+awk 'BEGIN{FS=OFS="\t"} {print $6,$2,$2+$1,$4,$1,length($4)}' ${INPUT}.tetra.out > ${INPUT}.tetra.TR
+
+
+
+echo "Job end on `hostname` at `date`"
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/commandline_sample_STR-FM_shortread_profiling Wed Apr 22 12:22:50 2015 -0400
@@ -0,0 +1,124 @@
+## This is a sample PBS script for profiling STR from short read using STR-FM version 2.0.0 (April 20, 2015)
+## ${INPUT} is not assigned in this script; it must be supplied by the caller (e.g. through the job environment)
+##requirement
+##1 fastq input in sangerfq Phred scale --> ${INPUT}.fastq
+##2 index of mapping program (bwa, bowtie, etc)
+##3 location of all STR in reference genome (use the sample script "commandline_sample_STR-FM_reference_profiling") --> /path/to/STR/in/reference/genome.TR (you can make 4 separated TR files for 4 types of STRs)
+##4 reference genome in FASTA and in 2bit file --> /path/to/2bit/ref.2bit (use utility from UCSC genome browser to create 2bit file version of reference genome)
+##5 local Galaxy (available from Galaxy website for Mac and Unix computer)
+##6 STR error rates (can be downloaded from https://usegalaxy.org/u/guru%40psu.edu/h/error-rates-files) --> errorrate.bymajorallele
+##
+echo " "
+echo " "
+echo "Job started on `hostname` at `date`"
+ref=/path/to/reference/sequence/and/bwa/index/ref.fa
+export PYTHONPATH=/path/to/galaxy-dist/lib/
+galaxydir=/path/to/galaxy-dist/tools
+cd /working/directory/
+echo " "
+echo " detect STR in short read" ## See detail in microsatellite.xml on https://github.com/Arkarachai/STR-FM
+python microsatellite.py ${INPUT}.fastq --fastq --period=1 --partialmotifs --minlength=5 --prefix=20 --suffix=20 --hamming=0 --multipleruns >${INPUT}.mono.out
+python microsatellite.py ${INPUT}.fastq --fastq --period=2 --partialmotifs --minlength=6 --prefix=20 --suffix=20 --hamming=0 --multipleruns >${INPUT}.di.out
+python microsatellite.py ${INPUT}.fastq --fastq --period=3 --partialmotifs --minlength=9 --prefix=20 --suffix=20 --hamming=0 --multipleruns >${INPUT}.tri.out
+python microsatellite.py ${INPUT}.fastq --fastq --period=4 --partialmotifs --minlength=12 --prefix=20 --suffix=20 --hamming=0 --multipleruns >${INPUT}.tetra.out
+
+echo "change read name at " ## See detail in space2underscore_readname.xml on https://github.com/Arkarachai/STR-FM
+python changespacetounderscore_readname.py ${INPUT}.mono.out ${INPUT}.mono.new 6
+python changespacetounderscore_readname.py ${INPUT}.di.out ${INPUT}.di.new 6
+python changespacetounderscore_readname.py ${INPUT}.tri.out ${INPUT}.tri.new 6
+python changespacetounderscore_readname.py ${INPUT}.tetra.out ${INPUT}.tetra.new 6
+
+echo "start fetch flanking at `date`" ## See detail in fetchflank.xml on https://github.com/Arkarachai/STR-FM
+python pair_fetch_DNA_ff.py ${INPUT}.mono.new ${INPUT}.mono_ff_L.txt ${INPUT}.mono_ff_R.txt 20 20
+python pair_fetch_DNA_ff.py ${INPUT}.di.new ${INPUT}.di_ff_L.txt ${INPUT}.di_ff_R.txt 20 20
+python pair_fetch_DNA_ff.py ${INPUT}.tri.new ${INPUT}.tri_ff_L.txt ${INPUT}.tri_ff_R.txt 20 20
+python pair_fetch_DNA_ff.py ${INPUT}.tetra.new ${INPUT}.tetra_ff_L.txt ${INPUT}.tetra_ff_R.txt 20 20
+
+echo "BWA uniquely mapped no indel no deletion "
+bwa aln -n 0 -o 0 ${ref} ${INPUT}.mono_ff_L.txt > ${INPUT}.mono_ff_L.sai
+bwa aln -n 0 -o 0 ${ref} ${INPUT}.mono_ff_R.txt > ${INPUT}.mono_ff_R.sai
+bwa sampe ${ref} ${INPUT}.mono_ff_L.sai ${INPUT}.mono_ff_R.sai ${INPUT}.mono_ff_L.txt ${INPUT}.mono_ff_R.txt > ${INPUT}.mono.sam
+samtools view -Sb -F 12 -q 1 ${INPUT}.mono.sam > ${INPUT}.mono.n.all.bam
+bwa aln -n 0 -o 0 ${ref} ${INPUT}.di_ff_L.txt > ${INPUT}.di_ff_L.sai
+bwa aln -n 0 -o 0 ${ref} ${INPUT}.di_ff_R.txt > ${INPUT}.di_ff_R.sai
+bwa sampe ${ref} ${INPUT}.di_ff_L.sai ${INPUT}.di_ff_R.sai ${INPUT}.di_ff_L.txt ${INPUT}.di_ff_R.txt > ${INPUT}.di.sam
+samtools view -Sb -F 12 -q 1 ${INPUT}.di.sam > ${INPUT}.di.n.all.bam
+bwa aln -n 0 -o 0 ${ref} ${INPUT}.tri_ff_L.txt > ${INPUT}.tri_ff_L.sai
+bwa aln -n 0 -o 0 ${ref} ${INPUT}.tri_ff_R.txt > ${INPUT}.tri_ff_R.sai
+bwa sampe ${ref} ${INPUT}.tri_ff_L.sai ${INPUT}.tri_ff_R.sai ${INPUT}.tri_ff_L.txt ${INPUT}.tri_ff_R.txt > ${INPUT}.tri.sam
+samtools view -Sb -F 12 -q 1 ${INPUT}.tri.sam > ${INPUT}.tri.n.all.bam
+bwa aln -n 0 -o 0 ${ref} ${INPUT}.tetra_ff_L.txt > ${INPUT}.tetra_ff_L.sai
+bwa aln -n 0 -o 0 ${ref} ${INPUT}.tetra_ff_R.txt > ${INPUT}.tetra_ff_R.sai
+bwa sampe ${ref} ${INPUT}.tetra_ff_L.sai ${INPUT}.tetra_ff_R.sai ${INPUT}.tetra_ff_L.txt ${INPUT}.tetra_ff_R.txt > ${INPUT}.tetra.sam
+samtools view -Sb -F 12 -q 1 ${INPUT}.tetra.sam > ${INPUT}.tetra.n.all.bam
+
+echo "sort result by read name" ## NOTE(review): "samtools sort <in.bam> <out.prefix>" looks like the legacy (pre-1.0) samtools syntax -- confirm the installed version
+samtools sort -n ${INPUT}.mono.n.all.bam ${INPUT}.mono.n.sorted.all
+samtools sort -n ${INPUT}.di.n.all.bam ${INPUT}.di.n.sorted.all
+samtools sort -n ${INPUT}.tri.n.all.bam ${INPUT}.tri.n.sorted.all
+samtools sort -n ${INPUT}.tetra.n.all.bam ${INPUT}.tetra.n.sorted.all
+samtools view -h -o ${INPUT}.mono.n.sorted.all.sam ${INPUT}.mono.n.sorted.all.bam
+samtools view -h -o ${INPUT}.di.n.sorted.all.sam ${INPUT}.di.n.sorted.all.bam
+samtools view -h -o ${INPUT}.tri.n.sorted.all.sam ${INPUT}.tri.n.sorted.all.bam
+samtools view -h -o ${INPUT}.tetra.n.sorted.all.sam ${INPUT}.tetra.n.sorted.all.bam
+
+echo "merge faux paired end reads" ## See detail in PEsortedSAM2readprofile.xml on https://github.com/Arkarachai/STR-FM
+python PEsortedSAM2readprofile.py ${INPUT}.mono.n.sorted.all.sam /path/to/2bit/ref.2bit 100 250 ${INPUT}.mono.RF ## last argument is the per-type output read by the join step below
+python PEsortedSAM2readprofile.py ${INPUT}.di.n.sorted.all.sam /path/to/2bit/ref.2bit 100 250 ${INPUT}.di.RF
+python PEsortedSAM2readprofile.py ${INPUT}.tri.n.sorted.all.sam /path/to/2bit/ref.2bit 100 250 ${INPUT}.tri.RF
+python PEsortedSAM2readprofile.py ${INPUT}.tetra.n.sorted.all.sam /path/to/2bit/ref.2bit 100 250 ${INPUT}.tetra.RF
+
+echo "join mapped coordinate with STR length using read name"
+python ${galaxydir}/filters/join.py ${INPUT}.mono.new ${INPUT}.mono.RF 6 1 ${INPUT}.mono.RF.j "" "" --index_depth=3 --buffer=50000000 --fill_options_file='None' ## fifth argument is the per-type output read by gops_join.py below
+python ${galaxydir}/filters/join.py ${INPUT}.di.new ${INPUT}.di.RF 6 1 ${INPUT}.di.RF.j "" "" --index_depth=3 --buffer=50000000 --fill_options_file='None'
+python ${galaxydir}/filters/join.py ${INPUT}.tri.new ${INPUT}.tri.RF 6 1 ${INPUT}.tri.RF.j "" "" --index_depth=3 --buffer=50000000 --fill_options_file='None'
+python ${galaxydir}/filters/join.py ${INPUT}.tetra.new ${INPUT}.tetra.RF 6 1 ${INPUT}.tetra.RF.j "" "" --index_depth=3 --buffer=50000000 --fill_options_file='None'
+
+echo "join mapped coordinate and STR length with STR location in genome"
+python ${galaxydir}/new_operations/gops_join.py /path/to/STR/in/reference/genome.TR ${INPUT}.mono.RF.j ${INPUT}.mono.gop -1 1,2,3,0 -2 10,13,14,0 -m 1 -f
+python ${galaxydir}/new_operations/gops_join.py /path/to/STR/in/reference/genome.TR ${INPUT}.di.RF.j ${INPUT}.di.gop -1 1,2,3,0 -2 10,13,14,0 -m 1 -f
+python ${galaxydir}/new_operations/gops_join.py /path/to/STR/in/reference/genome.TR ${INPUT}.tri.RF.j ${INPUT}.tri.gop -1 1,2,3,0 -2 10,13,14,0 -m 1 -f
+python ${galaxydir}/new_operations/gops_join.py /path/to/STR/in/reference/genome.TR ${INPUT}.tetra.RF.j ${INPUT}.tetra.gop -1 1,2,3,0 -2 10,13,14,0 -m 1 -f
+
+echo "remove incompatible motif (remove incorrect mapped reads given that there is no STR motif difference from reference genome)" ## See detail in microsatcompat.xml on https://github.com/Arkarachai/STR-FM
+python microsatcompat.py ${INPUT}.mono.gop 4 10 > ${INPUT}.mono.fulltable1
+python microsatcompat.py ${INPUT}.di.gop 4 10 > ${INPUT}.di.fulltable1
+python microsatcompat.py ${INPUT}.tri.gop 4 10 > ${INPUT}.tri.fulltable1
+python microsatcompat.py ${INPUT}.tetra.gop 4 10 > ${INPUT}.tetra.fulltable1
+
+echo "remove shifting flanking location (remove cases that come from STR interruption or flanking bases are misread as STRs)"
+cat ${INPUT}.mono.fulltable1 | awk '($19==$2) && ($20==$3) {print $0}' > ${INPUT}.mono.fulltable2
+cat ${INPUT}.di.fulltable1 | awk '($19==$2) && ($20==$3) {print $0}' > ${INPUT}.di.fulltable2
+cat ${INPUT}.tri.fulltable1 | awk '($19==$2) && ($20==$3) {print $0}' > ${INPUT}.tri.fulltable2
+cat ${INPUT}.tetra.fulltable1 | awk '($19==$2) && ($20==$3) {print $0}' > ${INPUT}.tetra.fulltable2
+
+echo "keep only column that are necessary for profiling"
+cat ${INPUT}.mono.fulltable2| cut -f 1,2,3,4,5,7 | sort -k 1n,1 -k 2n,2 -k 3n,3 > ${INPUT}.mono.cuttmp0
+cat ${INPUT}.di.fulltable2| cut -f 1,2,3,4,5,7 | sort -k 1n,1 -k 2n,2 -k 3n,3 > ${INPUT}.di.cuttmp0
+cat ${INPUT}.tri.fulltable2| cut -f 1,2,3,4,5,7 | sort -k 1n,1 -k 2n,2 -k 3n,3 > ${INPUT}.tri.cuttmp0
+cat ${INPUT}.tetra.fulltable2| cut -f 1,2,3,4,5,7 | sort -k 1n,1 -k 2n,2 -k 3n,3 > ${INPUT}.tetra.cuttmp0
+
+echo "If you multiple analysis by splitting initial fastq, you should merge (cat) all results from the same sample after this step"
+
+echo "create genomic coordinate column and group by that column" ## cuttmp0 -> cuttmp1 (add "_" column) -> cuttmp2 (add merged coordinate column) -> cuttmp3 (grouped; grouping.py takes the output file first, then the input)
+perl ${galaxydir}/filters/fixedValueColumn.pl ${INPUT}.mono.cuttmp0 ${INPUT}.mono.cuttmp1 "_" "no"
+python ${galaxydir}/filters/mergeCols.py ${INPUT}.mono.cuttmp1 ${INPUT}.mono.cuttmp2 1 7 2 7 3
+python ${galaxydir}/stats/grouping.py ${INPUT}.mono.cuttmp3 ${INPUT}.mono.cuttmp2 8 0 'cat 6 0' 'cat_uniq 4 0'
+perl ${galaxydir}/filters/fixedValueColumn.pl ${INPUT}.di.cuttmp0 ${INPUT}.di.cuttmp1 "_" "no"
+python ${galaxydir}/filters/mergeCols.py ${INPUT}.di.cuttmp1 ${INPUT}.di.cuttmp2 1 7 2 7 3
+python ${galaxydir}/stats/grouping.py ${INPUT}.di.cuttmp3 ${INPUT}.di.cuttmp2 8 0 'cat 6 0' 'cat_uniq 4 0'
+perl ${galaxydir}/filters/fixedValueColumn.pl ${INPUT}.tri.cuttmp0 ${INPUT}.tri.cuttmp1 "_" "no"
+python ${galaxydir}/filters/mergeCols.py ${INPUT}.tri.cuttmp1 ${INPUT}.tri.cuttmp2 1 7 2 7 3
+python ${galaxydir}/stats/grouping.py ${INPUT}.tri.cuttmp3 ${INPUT}.tri.cuttmp2 8 0 'cat 6 0' 'cat_uniq 4 0'
+perl ${galaxydir}/filters/fixedValueColumn.pl ${INPUT}.tetra.cuttmp0 ${INPUT}.tetra.cuttmp1 "_" "no"
+python ${galaxydir}/filters/mergeCols.py ${INPUT}.tetra.cuttmp1 ${INPUT}.tetra.cuttmp2 1 7 2 7 3
+python ${galaxydir}/stats/grouping.py ${INPUT}.tetra.cuttmp3 ${INPUT}.tetra.cuttmp2 8 0 'cat 6 0' 'cat_uniq 4 0'
+
+echo "you may filter for minimum sequencing depth here"
+
+echo "genotyping using error correction model" ## See detail in GenotypingSTR.xml on https://github.com/Arkarachai/STR-FM
+cat ${INPUT}.mono.cuttmp3 ${INPUT}.di.cuttmp3 ${INPUT}.tri.cuttmp3 ${INPUT}.tetra.cuttmp3 > ${INPUT}.step5 ## genotype the grouped per-locus profiles (cuttmp3), not the ungrouped cuttmp2 tables
+python GenotypeTRcorrection.py ${INPUT}.step5 errorrate.bymajorallele ${INPUT}.step5.result 0.5
+## final output is ${INPUT}.step5.result
+
+echo "Job end on `hostname` at `date`"
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/C_sample_fastq Wed Apr 22 12:22:50 2015 -0400 @@ -0,0 +1,8 @@ +@IL2_40_2_1_735_755 +ATTTTCCAGCACCGTCATGTGGTTCCAGAGGTTAAAGTGCTGAAATAACAT ++ +IIIIIIIIIIIIIIIIIIIIIIII4IIIIIIIII5IIDI)'7%*8%%%%5* +@IL2_40_2_1_919_700 +ATAAGGAAAAAAAAAAAAAAAACCAGGTCTTTTTTTTTTTTTTTTTGTTAT ++ +IIIIIIIIIIIIIIIIIIIIII@IIII2III4-II47I?CII>-%:C-;$&
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/C_sample_snoope Wed Apr 22 12:22:50 2015 -0400 @@ -0,0 +1,4 @@ +3 33 15 A 0 IL2_40_2_1_735_755_1_per1_2 ATTTTCCAGCACCGTCATGTGGTTCCAGAGGTTaaaGTGCTGAAATAACAT IIIIIIIIIIIIIIIIIIIIIIII4IIIIIIIII5IIDI)'7%*8%%%%5* +3 42 6 A 0 IL2_40_2_1_735_755_1_per1_3 ATTTTCCAGCACCGTCATGTGGTTCCAGAGGTTAAAGTGCTGaaaTAACAT IIIIIIIIIIIIIIIIIIIIIIII4IIIIIIIII5IIDI)'7%*8%%%%5* +16 6 29 A 0 IL2_40_2_1_919_700_1_per1_1 ATAAGGaaaaaaaaaaaaaaaaCCAGGTCTTTTTTTTTTTTTTTTTGTTAT IIIIIIIIIIIIIIIIIIIIII@IIII2III4-II47I?CII>-%:C-;$& +17 29 5 T 0 IL2_40_2_1_919_700_1_per1_2 ATAAGGAAAAAAAAAAAAAAAACCAGGTCtttttttttttttttttGTTAT IIIIIIIIIIIIIIIIIIIIII@IIII2III4-II47I?CII>-%:C-;$&
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/PCRinclude.allrate.bymajorallele Wed Apr 22 12:22:50 2015 -0400 @@ -0,0 +1,997 @@ +10 10 91456 A +10 9 1259 A +10 11 605 A +10 8 16 A +10 12 8 A +10 7 2 A +11 11 39657 A +11 10 1211 A +11 12 514 A +11 9 54 A +11 13 9 A +11 8 3 A +11 14 1 A +12 12 18850 A +12 11 986 A +12 13 417 A +12 10 73 A +12 14 8 A +12 9 1 A +12 8 1 A +13 13 10201 A +13 12 885 A +13 14 320 A +13 11 83 A +13 15 12 A +13 10 8 A +14 14 3649 A +14 13 409 A +14 15 151 A +14 12 62 A +14 11 6 A +14 16 5 A +14 10 1 A +15 15 847 A +15 14 140 A +15 16 60 A +15 13 20 A +15 17 4 A +15 12 3 A +16 16 182 A +16 15 60 A +16 17 14 A +16 14 12 A +16 13 1 A +16 12 1 A +16 18 1 A +17 17 11 A +17 16 5 A +17 15 2 A +17 18 1 A +18 18 4 A +18 17 2 A +5 5 10047169 A +5 6 44 A +6 6 2808071 A +6 5 195 A +6 7 69 A +7 7 1097174 A +7 6 313 A +7 8 83 A +7 5 6 A +8 8 369496 A +8 7 387 A +8 9 248 A +8 6 3 A +8 10 2 A +9 9 184958 A +9 8 707 A +9 10 486 A +9 7 5 A +9 11 4 A +10 10 46 C +10 9 3 C +5 5 1354993 C +5 6 7 C +6 6 193431 C +6 5 14 C +6 7 2 C +7 7 22171 C +7 6 4 C +8 8 2966 C +8 9 3 C +8 7 3 C +9 9 638 C +9 8 8 C +9 7 1 C +10 10 21211 AC +10 8 3 AC +10 12 1 AC +11 11 15048 AC +11 9 10 AC +12 12 6043 AC +12 10 15 AC +12 14 1 AC +13 13 5070 AC +13 11 40 AC +13 15 1 AC +14 14 3093 AC +14 12 44 AC +14 10 1 AC +15 15 2848 AC +15 13 31 AC +15 17 1 AC +16 16 1273 AC +16 14 30 AC +16 12 2 AC +17 17 1297 AC +17 15 27 AC +18 18 1269 AC +18 16 43 AC +18 20 2 AC +18 14 1 AC +19 19 679 AC +19 17 17 AC +19 21 1 AC +20 20 645 AC +20 18 34 AC +20 22 2 AC +20 16 1 AC +21 21 723 AC +21 19 28 AC +21 17 1 AC +21 23 1 AC +22 22 499 AC +22 20 29 AC +22 18 3 AC +23 23 540 AC +23 21 30 AC +23 19 2 AC +23 25 1 AC +24 24 385 AC +24 22 38 AC +24 26 2 AC +24 20 1 AC +25 25 407 AC +25 23 22 AC +25 27 2 AC +25 21 1 AC +26 26 257 AC +26 24 30 AC +26 22 3 AC +26 28 1 AC +26 20 1 AC +27 27 339 AC +27 25 28 AC +27 23 3 AC +27 29 2 AC +28 28 202 AC +28 26 17 AC +28 30 6 AC +29 29 277 AC +29 27 
29 AC +29 31 6 AC +29 25 3 AC +30 30 117 AC +30 28 12 AC +30 32 3 AC +30 18 1 AC +31 31 144 AC +31 29 18 AC +31 27 4 AC +31 33 2 AC +32 32 101 AC +32 30 23 AC +32 28 2 AC +32 34 2 AC +32 26 1 AC +33 33 106 AC +33 31 15 AC +33 35 3 AC +33 29 1 AC +34 34 33 AC +34 32 7 AC +35 35 21 AC +35 33 4 AC +35 31 1 AC +36 36 12 AC +36 34 1 AC +37 37 10 AC +37 35 3 AC +37 31 1 AC +37 39 1 AC +38 38 4 AC +38 36 1 AC +6 6 1521439 AC +7 7 513952 AC +8 8 134603 AC +8 6 2 AC +9 9 60741 AC +9 7 3 AC +9 11 1 AC +10 10 21772 AG +10 8 3 AG +10 12 1 AG +11 11 13880 AG +11 9 10 AG +11 13 1 AG +12 12 5628 AG +12 10 13 AG +12 14 4 AG +13 13 4494 AG +13 11 17 AG +14 14 1898 AG +14 12 15 AG +15 15 2427 AG +15 13 18 AG +16 16 1076 AG +16 14 24 AG +16 12 1 AG +17 17 874 AG +17 15 12 AG +17 19 1 AG +17 13 1 AG +18 18 536 AG +18 16 20 AG +18 14 1 AG +19 19 563 AG +19 17 25 AG +20 20 201 AG +20 18 14 AG +21 21 260 AG +21 19 10 AG +22 22 83 AG +22 20 5 AG +23 23 147 AG +23 21 5 AG +23 25 1 AG +24 24 99 AG +24 22 4 AG +24 18 1 AG +25 25 62 AG +25 23 3 AG +25 27 1 AG +26 26 38 AG +26 24 8 AG +27 27 24 AG +27 25 3 AG +27 23 1 AG +28 28 14 AG +28 26 2 AG +29 29 12 AG +29 27 5 AG +29 31 1 AG +30 30 7 AG +30 28 2 AG +31 31 7 AG +31 27 3 AG +31 23 1 AG +32 32 4 AG +32 28 1 AG +6 6 1880822 AG +7 7 684837 AG +7 9 1 AG +8 8 183381 AG +9 9 75547 AG +9 7 6 AG +9 11 1 AG +10 10 18179 AT +10 8 7 AT +10 12 4 AT +11 11 8969 AT +11 9 5 AT +11 13 2 AT +12 12 4888 AT +12 10 8 AT +12 14 2 AT +13 13 2785 AT +13 11 17 AT +13 15 1 AT +14 14 2310 AT +14 12 40 AT +14 16 4 AT +14 10 2 AT +15 15 1461 AT +15 13 33 AT +15 11 1 AT +15 17 1 AT +16 16 879 AT +16 14 42 AT +16 18 2 AT +16 12 1 AT +17 17 599 AT +17 15 38 AT +17 19 2 AT +17 13 1 AT +18 18 367 AT +18 16 29 AT +18 20 7 AT +18 14 1 AT +19 19 223 AT +19 17 34 AT +19 21 3 AT +20 20 97 AT +20 18 14 AT +20 16 2 AT +20 22 1 AT +21 21 60 AT +21 19 18 AT +21 17 1 AT +22 22 53 AT +22 20 15 AT +22 24 5 AT +22 18 3 AT +23 23 11 AT +23 21 1 AT +24 24 7 AT +24 20 2 AT +24 22 2 AT 
+6 6 1671932 AT +6 8 1 AT +7 7 595145 AT +8 8 195533 AT +8 10 5 AT +8 6 2 AT +9 9 52576 AT +9 7 3 AT +10 10 17 CG +11 11 17 CG +12 12 6 CG +6 6 4097 CG +7 7 678 CG +8 8 184 CG +9 9 19 CG +10 10 19552 AAC +11 11 19003 AAC +12 12 6245 AAC +12 9 1 AAC +13 13 3406 AAC +14 14 8448 AAC +14 11 2 AAC +15 15 2356 AAC +15 12 6 AAC +16 16 1373 AAC +16 13 4 AAC +17 17 3140 AAC +17 14 5 AAC +18 18 944 AAC +18 15 2 AAC +19 19 456 AAC +19 16 1 AAC +20 20 1474 AAC +20 17 3 AAC +21 21 328 AAC +21 18 1 AAC +22 22 178 AAC +23 23 538 AAC +23 26 1 AAC +24 24 112 AAC +25 25 60 AAC +26 26 239 AAC +26 23 1 AAC +27 27 45 AAC +28 28 58 AAC +28 25 2 AAC +29 29 77 AAC +30 30 17 AAC +31 31 38 AAC +31 28 1 AAC +32 32 94 AAC +32 29 3 AAC +33 33 15 AAC +35 35 55 AAC +35 32 1 AAC +38 38 12 AAC +41 41 6 AAC +9 9 57212 AAC +10 10 31455 AAG +11 11 11876 AAG +12 12 3458 AAG +12 9 6 AAG +13 13 1141 AAG +14 14 928 AAG +15 15 548 AAG +15 12 4 AAG +16 16 189 AAG +17 17 235 AAG +18 18 63 AAG +19 19 66 AAG +20 20 122 AAG +22 22 11 AAG +23 23 33 AAG +9 9 104524 AAG +10 10 69106 AAT +11 11 30381 AAT +12 12 12001 AAT +12 9 1 AAT +13 13 7168 AAT +13 10 2 AAT +14 14 5470 AAT +14 11 3 AAT +15 15 2524 AAT +15 12 3 AAT +16 16 1733 AAT +16 13 1 AAT +17 17 1324 AAT +17 14 3 AAT +18 18 1022 AAT +18 15 3 AAT +19 19 502 AAT +19 16 3 AAT +20 20 570 AAT +20 17 2 AAT +21 21 370 AAT +21 18 1 AAT +22 22 98 AAT +23 23 164 AAT +23 20 3 AAT +24 24 143 AAT +24 21 1 AAT +25 25 122 AAT +25 22 1 AAT +26 26 45 AAT +26 23 2 AAT +27 27 32 AAT +27 24 1 AAT +28 28 6 AAT +29 29 64 AAT +29 26 1 AAT +30 30 28 AAT +30 24 1 AAT +31 31 9 AAT +32 32 9 AAT +32 29 1 AAT +38 38 6 AAT +9 9 179182 AAT +9 12 1 AAT +10 10 14290 ACC +11 11 5692 ACC +12 12 1795 ACC +13 13 1141 ACC +14 14 545 ACC +15 15 308 ACC +16 16 162 ACC +17 17 107 ACC +18 18 23 ACC +19 19 35 ACC +20 20 44 ACC +21 21 5 ACC +22 22 5 ACC +22 19 1 ACC +23 23 11 ACC +25 25 7 ACC +26 26 7 ACC +27 27 10 ACC +28 28 24 ACC +28 25 1 ACC +35 35 5 ACC +9 9 46614 ACC +10 10 2865 ACG +11 11 900 
ACG +12 12 325 ACG +13 13 82 ACG +14 14 83 ACG +9 9 9465 ACG +10 10 6269 ACT +11 11 2284 ACT +12 12 634 ACT +13 13 441 ACT +14 14 295 ACT +15 15 118 ACT +16 16 60 ACT +17 17 71 ACT +18 18 58 ACT +19 19 42 ACT +20 20 24 ACT +24 24 5 ACT +37 37 8 ACT +41 41 5 ACT +41 35 1 ACT +9 9 20025 ACT +10 10 2897 AGC +11 11 948 AGC +12 12 320 AGC +13 13 97 AGC +14 14 87 AGC +15 15 13 AGC +16 16 9 AGC +17 17 25 AGC +17 14 1 AGC +9 9 9579 AGC +10 10 21141 AGG +11 11 8128 AGG +12 12 2964 AGG +13 13 1209 AGG +14 14 860 AGG +15 15 320 AGG +16 16 190 AGG +17 17 225 AGG +18 18 147 AGG +20 20 80 AGG +21 21 9 AGG +22 22 35 AGG +23 23 27 AGG +24 24 8 AGG +26 26 9 AGG +9 9 57350 AGG +10 10 5964 ATC +11 11 2346 ATC +12 12 789 ATC +13 13 386 ATC +14 14 285 ATC +15 15 165 ATC +16 16 93 ATC +17 17 149 ATC +18 18 51 ATC +19 19 6 ATC +20 20 15 ATC +21 21 15 ATC +22 22 29 ATC +23 23 25 ATC +24 24 24 ATC +26 26 34 ATC +27 27 9 ATC +28 28 30 ATC +29 29 8 ATC +30 30 8 ATC +31 31 11 ATC +34 34 11 ATC +34 31 1 ATC +36 36 5 ATC +9 9 19837 ATC +10 10 11 CCG +11 11 24 CCG +14 14 5 CCG +16 16 5 CCG +9 9 135 CCG +12 12 10192 AAAC +13 13 4917 AAAC +14 14 4704 AAAC +15 15 12713 AAAC +16 16 2415 AAAC +17 17 1431 AAAC +18 18 1861 AAAC +18 14 2 AAAC +19 19 5254 AAAC +19 15 2 AAAC +19 23 1 AAAC +20 20 913 AAAC +20 16 1 AAAC +21 21 615 AAAC +22 22 509 AAAC +22 18 2 AAAC +23 23 2249 AAAC +23 19 5 AAAC +23 15 1 AAAC +24 24 329 AAAC +24 20 2 AAAC +25 25 230 AAAC +25 21 1 AAAC +26 26 175 AAAC +27 27 548 AAAC +27 23 2 AAAC +28 28 195 AAAC +28 24 1 AAAC +29 29 62 AAAC +30 30 67 AAAC +31 31 165 AAAC +31 27 1 AAAC +32 32 64 AAAC +33 33 63 AAAC +34 34 21 AAAC +35 35 40 AAAC +36 36 55 AAAC +37 37 6 AAAC +38 38 8 AAAC +39 39 10 AAAC +40 40 7 AAAC +45 45 7 AAAC +12 12 12855 AAAG +12 16 13 AAAG +12 20 9 AAAG +12 18 2 AAAG +13 13 6727 AAAG +14 14 3699 AAAG +14 13 8 AAAG +15 15 3858 AAAG +15 17 6 AAAG +15 13 1 AAAG +16 16 1244 AAAG +17 17 750 AAAG +17 13 1 AAAG +18 18 380 AAAG +18 20 5 AAAG +18 14 1 AAAG +19 19 1164 AAAG +19 
15 1 AAAG +20 20 153 AAAG +21 21 186 AAAG +22 22 115 AAAG +23 23 321 AAAG +23 19 1 AAAG +24 24 82 AAAG +25 25 89 AAAG +26 26 26 AAAG +26 13 3 AAAG +27 27 64 AAAG +28 28 36 AAAG +29 29 32 AAAG +31 31 31 AAAG +33 33 19 AAAG +35 35 10 AAAG +36 36 11 AAAG +38 38 16 AAAG +41 41 5 AAAG +12 12 23143 AAAT +13 13 10045 AAAT +14 14 6815 AAAT +15 15 8439 AAAT +16 16 3102 AAAT +16 12 2 AAAT +17 17 2018 AAAT +17 13 2 AAAT +18 18 2044 AAAT +19 19 2955 AAAT +19 15 1 AAAT +19 14 1 AAAT +20 20 909 AAAT +21 21 711 AAAT +21 17 2 AAAT +22 22 500 AAAT +22 18 2 AAAT +23 23 993 AAAT +23 19 3 AAAT +24 24 382 AAAT +24 20 3 AAAT +25 25 190 AAAT +26 26 185 AAAT +26 22 1 AAAT +27 27 281 AAAT +27 23 2 AAAT +28 28 165 AAAT +28 24 2 AAAT +29 29 48 AAAT +30 30 46 AAAT +31 31 101 AAAT +32 32 28 AAAT +33 33 19 AAAT +34 34 24 AAAT +34 30 1 AAAT +35 35 41 AAAT +35 31 2 AAAT +36 36 16 AAAT +37 37 6 AAAT +38 38 5 AAAT +39 39 20 AAAT +39 35 1 AAAT +40 40 5 AAAT +41 41 10 AAAT +42 42 6 AAAT +45 45 6 AAAT +12 12 1468 AACC +13 13 590 AACC +14 14 318 AACC +15 15 163 AACC +16 16 102 AACC +17 17 106 AACC +18 18 18 AACC +19 19 34 AACC +20 20 7 AACC +22 22 7 AACC +23 23 13 AACC +24 24 16 AACC +25 25 9 AACC +31 31 9 AACC +12 12 214 AACG +13 13 135 AACG +14 14 39 AACG +15 15 45 AACG +12 12 522 AACT +13 13 142 AACT +14 14 143 AACT +15 15 88 AACT +16 16 16 AACT +17 17 51 AACT +18 18 7 AACT +20 20 21 AACT +21 21 27 AACT +23 23 7 AACT +24 24 11 AACT +30 30 5 AACT +12 12 346 AAGC +13 13 83 AAGC +14 14 60 AAGC +15 15 40 AAGC +16 16 21 AAGC +18 18 9 AAGC +19 19 7 AAGC +12 12 4943 AAGG +13 13 2714 AAGG +14 14 1385 AAGG +14 15 3 AAGG +15 15 949 AAGG +16 16 612 AAGG +16 14 4 AAGG +17 17 331 AAGG +18 18 362 AAGG +19 19 204 AAGG +20 20 138 AAGG +21 21 149 AAGG +22 22 68 AAGG +23 23 49 AAGG +24 24 27 AAGG +25 25 44 AAGG +26 26 8 AAGG +27 27 14 AAGG +28 28 14 AAGG +29 29 14 AAGG +30 30 12 AAGG +31 31 23 AAGG +34 34 11 AAGG +43 43 6 AAGG +12 12 2676 AAGT +13 13 1438 AAGT +14 14 940 AAGT +15 15 649 AAGT +16 16 305 AAGT +17 17 
291 AAGT +18 18 181 AAGT +19 19 55 AAGT +20 20 73 AAGT +21 21 8 AAGT +22 22 43 AAGT +22 26 1 AAGT +23 23 32 AAGT +23 19 1 AAGT +24 24 18 AAGT +25 25 19 AAGT +26 26 8 AAGT +27 27 12 AAGT +29 29 18 AAGT +30 30 12 AAGT +31 31 12 AAGT +32 32 11 AAGT +33 33 35 AAGT +34 34 9 AAGT +35 35 6 AAGT +12 12 594 AATC +13 13 205 AATC +14 14 88 AATC +15 15 112 AATC +16 16 20 AATC +17 17 81 AATC +18 18 23 AATC +21 21 13 AATC +22 22 8 AATC +24 24 19 AATC +26 26 7 AATC +28 28 9 AATC +33 33 6 AATC +12 12 2293 AATG +13 13 1226 AATG +14 14 678 AATG +15 15 455 AATG +16 16 222 AATG +17 17 211 AATG +18 18 104 AATG +19 19 79 AATG +20 20 40 AATG +21 21 33 AATG +22 22 73 AATG +23 23 24 AATG +24 24 16 AATG +25 25 18 AATG +26 26 15 AATG +27 27 22 AATG +27 23 1 AATG +28 28 5 AATG +32 32 17 AATG +33 33 16 AATG +12 12 2633 AATT +13 13 1086 AATT +14 14 1052 AATT +15 15 386 AATT +16 16 393 AATT +17 17 98 AATT +18 18 104 AATT +19 19 105 AATT +20 20 34 AATT +21 21 12 AATT +22 22 20 AATT +25 25 18 AATT +26 26 25 AATT +27 27 7 AATT +29 29 7 AATT +35 35 12 AATT +12 12 1406 ACAG +13 13 964 ACAG +14 14 300 ACAG +15 15 130 ACAG +16 16 102 ACAG +17 17 49 ACAG +18 18 30 ACAG +19 19 88 ACAG +20 20 5 ACAG +23 23 5 ACAG +12 12 4868 ACAT +12 15 4 ACAT +13 13 3216 ACAT +14 14 957 ACAT +15 15 1052 ACAT +16 16 588 ACAT +17 17 422 ACAT +18 18 239 ACAT +19 19 238 ACAT +19 15 1 ACAT +20 20 25 ACAT +21 21 79 ACAT +22 22 20 ACAT +23 23 38 ACAT +27 27 42 ACAT +29 29 18 ACAT +31 31 5 ACAT +32 32 5 ACAT +35 35 6 ACAT +36 36 9 ACAT +41 41 14 ACAT +44 44 8 ACAT +44 40 1 ACAT +50 50 12 ACAT +12 12 833 ACCC +13 13 345 ACCC +14 14 190 ACCC +15 15 60 ACCC +16 16 12 ACCC +17 17 15 ACCC +19 19 8 ACCG +12 12 416 ACCT +13 13 123 ACCT +14 14 140 ACCT +15 15 69 ACCT +16 16 41 ACCT +17 17 45 ACCT +19 19 18 ACCT +20 20 27 ACCT +21 21 19 ACCT +22 22 6 ACCT +27 27 13 ACCT +28 28 7 ACCT +29 29 9 ACCT +30 30 7 ACCT +34 34 6 ACCT +45 45 5 ACCT +12 12 84 ACGC +13 13 52 ACGC +15 15 63 ACGC +12 12 433 ACGG +13 13 163 ACGG +14 14 38 ACGG +15 15 
44 ACGG +16 16 7 ACGG +17 17 11 ACGG +19 19 6 ACGG +25 25 10 ACGG +12 12 1119 ACGT +13 13 509 ACGT +14 14 338 ACGT +15 15 16 ACGT +16 16 66 ACGT +17 17 7 ACGT +19 19 27 ACGT +12 12 2211 ACTC +13 13 685 ACTC +14 14 188 ACTC +15 15 151 ACTC +16 16 91 ACTC +18 18 17 ACTC +19 19 24 ACTC +20 20 23 ACTC +21 21 13 ACTC +23 23 19 ACTC +45 45 8 ACTC +12 12 161 ACTG +13 13 69 ACTG +14 14 7 ACTG +15 15 14 ACTG +16 16 15 ACTG +12 12 3118 AGAT +13 13 1216 AGAT +14 14 1084 AGAT +15 15 869 AGAT +16 16 508 AGAT +17 17 322 AGAT +18 18 159 AGAT +19 19 258 AGAT +20 20 63 AGAT +21 21 84 AGAT +22 22 69 AGAT +22 14 6 AGAT +23 23 112 AGAT +24 24 107 AGAT +25 25 36 AGAT +26 26 113 AGAT +27 27 42 AGAT +28 28 58 AGAT +29 29 37 AGAT +30 30 16 AGAT +31 31 32 AGAT +32 32 24 AGAT +33 33 10 AGAT +34 34 43 AGAT +35 35 6 AGAT +36 36 13 AGAT +36 32 1 AGAT +37 37 35 AGAT +38 38 34 AGAT +39 39 20 AGAT +39 35 2 AGAT +40 40 27 AGAT +41 41 29 AGAT +42 42 30 AGAT +43 43 87 AGAT +44 44 67 AGAT +45 45 20 AGAT +46 46 15 AGAT +47 47 28 AGAT +48 48 26 AGAT +49 49 13 AGAT +50 50 11 AGAT +52 52 5 AGAT +54 54 6 AGAT +12 12 236 AGCC +13 13 109 AGCC +14 14 17 AGCC +15 15 14 AGCC +16 16 8 AGCC +18 18 12 AGCC +21 21 18 AGCC +23 23 13 AGCC +12 12 23 AGCG +13 13 19 AGCG +18 18 9 AGCG +12 12 272 AGCT +13 13 89 AGCT +14 14 108 AGCT +15 15 49 AGCT +16 16 19 AGCT +17 17 19 AGCT +18 18 19 AGCT +19 19 44 AGCT +22 22 12 AGCT +27 27 16 AGCT +12 12 87 AGGC +13 13 19 AGGC +14 14 16 AGGC +18 18 7 AGGC +12 12 3610 AGGG +13 13 1980 AGGG +14 14 1095 AGGG +15 15 624 AGGG +16 16 159 AGGG +17 17 59 AGGG +18 18 43 AGGG +19 19 60 AGGG +20 20 49 AGGG +21 21 12 AGGG +23 23 10 AGGG +12 12 531 ATCC +13 13 323 ATCC +14 14 221 ATCC +15 15 58 ATCC +16 16 78 ATCC +17 17 38 ATCC +18 18 12 ATCC +19 19 19 ATCC +20 20 17 ATCC +21 21 44 ATCC +22 22 12 ATCC +23 23 39 ATCC +24 24 11 ATCC +25 25 12 ATCC +27 27 10 ATCC +32 32 6 ATCC +39 39 8 ATCC +40 40 6 ATCC +48 48 7 ATCC +12 12 272 ATCG +13 13 89 ATCG +14 14 108 ATCG +15 15 49 ATCG +16 16 19 ATCG +17 
17 19 ATCG +18 18 19 ATCG +19 19 44 ATCG +22 22 12 ATCG +27 27 16 ATCG +12 12 1119 ATGC +13 13 509 ATGC +14 14 338 ATGC +15 15 16 ATGC +16 16 66 ATGC +17 17 7 ATGC +19 19 27 ATGC +12 12 13 CCCG +12 12 178 AGTC +13 13 77 AGTC +14 14 13 AGTC +15 15 12 AGTC
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/combineprob_out.txt Wed Apr 22 12:22:50 2015 -0400 @@ -0,0 +1,7 @@ +read_depth allele heterozygous_prob motif +2 10_11 0.485943568663 A +2 11_12 0.472130683091 A +2 9_10 0.494635026326 A +3 10_11 0.71878954705 A +3 11_12 0.688571908761 A +3 9_10 0.73801798345 A
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/microsatcompat_in.txt Wed Apr 22 12:22:50 2015 -0400 @@ -0,0 +1,3 @@ +15 64416346 64416378 AT 32 16 18 22 61 TA 0 ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 TTCCTTTATAAGAAATCTTTACatatatatatatatatatGACTGTTTTGCTTTGTTTTGAGTTTCATAAAAATAGTATCATGGGGGCCGGTCACGGTGGC CCCFFFFFGHHFFIJIHGHIGIGGEGGIGHEGBHIIIJIFGCHGGIIJJEEIEIADHGICBFIGIGCGIJIIIGIIHIGDHGIIJHF>C888=@DB92<@? ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 15 64416324 64416346 64416346 64416378 64416378 64416439 32 ATATATATATATATATATATATATATATATAT +17 52191125 52191133 GA 8 4 8 26 67 AC 0 ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 CTTCCAGGGCCCTTCCAATGCCAAAAacacacacCTTTTTCCCCTGACCCTCTGTCAGTCTTCTGAATTTAAAGCTGGGCTCTGGGACTTACCAGTGTGAG CCCFFFFFHHHHHJJJJJJJJJJJJJJIHIIJIJJJJJJJJJJJIGIJJJJJJJHIJJIIJJJHHHHHHHFFFFFCEEDDDDDDDDBDDDDDDDDDCCCDC ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 17 52191099 52191125 52191125 52191133 52191133 52191200 8 ACACACAC +17 52191125 52191133 AC 8 4 8 26 67 AG 0 ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 CTTCCAGGGCCCTTCCAATGCCAAAAacacacacCTTTTTCCCCTGACCCTCTGTCAGTCTTCTGAATTTAAAGCTGGGCTCTGGGACTTACCAGTGTGAG CCCFFFFFHHHHHJJJJJJJJJJJJJJIHIIJIJJJJJJJJJJJIGIJJJJJJJHIJJIIJJJHHHHHHHFFFFFCEEDDDDDDDDBDDDDDDDDDCCCDC ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 17 52191099 52191125 52191125 52191133 52191133 52191200 8 AGAGAGAG
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/microsatcompat_out.txt Wed Apr 22 12:22:50 2015 -0400 @@ -0,0 +1,1 @@ +15 64416346 64416378 AT 32 16 18 22 61 TA 0 ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 TTCCTTTATAAGAAATCTTTACatatatatatatatatatGACTGTTTTGCTTTGTTTTGAGTTTCATAAAAATAGTATCATGGGGGCCGGTCACGGTGGC CCCFFFFFGHHFFIJIHGHIGIGGEGGIGHEGBHIIIJIFGCHGGIIJJEEIEIADHGICBFIGIGCGIJIIIGIIHIGDHGIIJHF>C888=@DB92<@? ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 15 64416324 64416346 64416346 64416378 64416378 64416439 32 ATATATATATATATATATATATATATATATAT
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/microsatellite_flanking_L.fastq Wed Apr 22 12:22:50 2015 -0400 @@ -0,0 +1,4 @@ +@SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1 +TACCCTCCTGTCTTCCCAGACTGATTTCTGTTCCTGCCCT ++SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1 +GGGGGGGGGGGGGGGGGFGGGGGGGGGFEGGGGGGGGGGG
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/microsatellite_flanking_R.fastq Wed Apr 22 12:22:50 2015 -0400 @@ -0,0 +1,4 @@ +@SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1 +TTCTTGACTCCTCTGAATGGGTACGGGAGTGTGGACCTCAGGGAGGCCCCCTTG ++SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1 +GGGGG?FFFGGGGGDEGGEFFBEFCEEBD@BACB*?=9;(/=5'6=4:?>C*A<
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/microsatpurity_in.txt Wed Apr 22 12:22:50 2015 -0400 @@ -0,0 +1,3 @@ +15 64416346 64416378 AT 32 16 18 22 61 AT 0 ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 TTCCTTTATAAGAAATCTTTACatatatatatatatatatGACTGTTTTGCTTTGTTTTGAGTTTCATAAAAATAGTATCATGGGGGCCGGTCACGGTGGC CCCFFFFFGHHFFIJIHGHIGIGGEGGIGHEGBHIIIJIFGCHGGIIJJEEIEIADHGICBFIGIGCGIJIIIGIIHIGDHGIIJHF>C888=@DB92<@? ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 15 64416324 64416346 64416346 64416378 64416378 64416439 32 ATATATATATATATATATATATATATATATAT +15 64416346 64416378 AT 32 16 18 22 61 AT 0 ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 TTCCTTTATAAGAAATCTTTACatatatatatatatatatGACTGTTTTGCTTTGTTTTGAGTTTCATAAAAATAGTATCATGGGGGCCGGTCACGGTGGC CCCFFFFFGHHFFIJIHGHIGIGGEGGIGHEGBHIIIJIFGCHGGIIJJEEIEIADHGICBFIGIGCGIJIIIGIIHIGDHGIIJHF>C888=@DB92<@? ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 15 64416324 64416346 64416346 64416378 64416378 64416439 32 ATATATATATATATATATTATATATATATAT +17 52191125 52191133 AC 8 4 8 26 67 AC 0 ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 CTTCCAGGGCCCTTCCAATGCCAAAAacacacacCTTTTTCCCCTGACCCTCTGTCAGTCTTCTGAATTTAAAGCTGGGCTCTGGGACTTACCAGTGTGAG CCCFFFFFHHHHHJJJJJJJJJJJJJJIHIIJIJJJJJJJJJJJIGIJJJJJJJHIJJIIJJJHHHHHHHFFFFFCEEDDDDDDDDBDDDDDDDDDCCCDC ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 17 52191099 52191125 52191125 52191133 52191133 52191200 8 ACACACAC
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/microsatpurity_out.txt Wed Apr 22 12:22:50 2015 -0400 @@ -0,0 +1,2 @@ +15 64416346 64416378 AT 32 16 18 22 61 AT 0 ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 TTCCTTTATAAGAAATCTTTACatatatatatatatatatGACTGTTTTGCTTTGTTTTGAGTTTCATAAAAATAGTATCATGGGGGCCGGTCACGGTGGC CCCFFFFFGHHFFIJIHGHIGIGGEGGIGHEGBHIIIJIFGCHGGIIJJEEIEIADHGICBFIGIGCGIJIIIGIIHIGDHGIIJHF>C888=@DB92<@? ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 15 64416324 64416346 64416346 64416378 64416378 64416439 32 ATATATATATATATATATATATATATATATAT +17 52191125 52191133 AC 8 4 8 26 67 AC 0 ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 CTTCCAGGGCCCTTCCAATGCCAAAAacacacacCTTTTTCCCCTGACCCTCTGTCAGTCTTCTGAATTTAAAGCTGGGCTCTGGGACTTACCAGTGTGAG CCCFFFFFHHHHHJJJJJJJJJJJJJJIHIIJIJJJJJJJJJJJIGIJJJJJJJHIJJIIJJJHHHHHHHFFFFFCEEDDDDDDDDBDDDDDDDDDCCCDC ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 17 52191099 52191125 52191125 52191133 52191133 52191200 8 ACACACAC
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/nice1tab.py Wed Apr 22 12:22:50 2015 -0400 @@ -0,0 +1,6 @@ +import sys +fd=open(sys.argv[1]) +lines=fd.readlines() +for line in lines: + temp=line.strip().split() + print '\t'.join(temp) \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/probvalueforhetero_in.txt Wed Apr 22 12:22:50 2015 -0400 @@ -0,0 +1,9 @@ +chr 9,10 A hetero -1.27220836321 10 10 9 +chr 10,11 A hetero -0.939119957032 11 11 10 +chr 11,12 A hetero -0.720375026792 12 12 11 +chr 9,9,10 A hetero -1.6841441619 9 9 10 +chr 9,10,10 A hetero -0.97233405327 10 10 9 +chr 10,10,11 A hetero -1.29451118958 10 10 11 +chr 10,11,11 A hetero -0.641022011041 11 11 10 +chr 11,11,12 A hetero -1.01921634129 11 11 12 +chr 11,12,12 A hetero -0.425116661902 12 12 11
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/probvalueforhetero_out.txt Wed Apr 22 12:22:50 2015 -0400 @@ -0,0 +1,9 @@ +chr 9,10 A hetero -1.27220836321 10 10 9 0.247317513163 2 0.494635026326 2 +chr 10,11 A hetero -0.939119957032 11 11 10 0.242971784331 2 0.485943568663 2 +chr 11,12 A hetero -0.720375026792 12 12 11 0.236065341545 2 0.472130683091 2 +chr 9,9,10 A hetero -1.6841441619 9 9 10 0.124528157268 3 0.373584471803 3 +chr 9,10,10 A hetero -0.97233405327 10 10 9 0.121477837216 3 0.364433511647 3 +chr 10,10,11 A hetero -1.29451118958 10 10 11 0.122575544751 3 0.367726634253 3 +chr 10,11,11 A hetero -0.641022011041 11 11 10 0.117020970932 3 0.351062912797 3 +chr 11,11,12 A hetero -1.01921634129 11 11 12 0.11865253007 3 0.35595759021 3 +chr 11,12,12 A hetero -0.425116661902 12 12 11 0.110871439517 3 0.332614318551 3
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/profilegenerator_in.txt Wed Apr 22 12:22:50 2015 -0400 @@ -0,0 +1,6 @@ +9 9 100000 +10 10 91456 +10 9 1259 +11 11 39657 +11 10 1211 +11 12 514
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/profilegenerator_out.txt Wed Apr 22 12:22:50 2015 -0400 @@ -0,0 +1,30 @@ +chr 9,9 A +chr 9,10 A +chr 9,11 A +chr 9,12 A +chr 10,10 A +chr 10,11 A +chr 10,12 A +chr 11,11 A +chr 11,12 A +chr 12,12 A +chr 9,9,9 A +chr 9,9,10 A +chr 9,9,11 A +chr 9,9,12 A +chr 9,10,10 A +chr 9,10,11 A +chr 9,10,12 A +chr 9,11,11 A +chr 9,11,12 A +chr 9,12,12 A +chr 10,10,10 A +chr 10,10,11 A +chr 10,10,12 A +chr 10,11,11 A +chr 10,11,12 A +chr 10,12,12 A +chr 11,11,11 A +chr 11,11,12 A +chr 11,12,12 A +chr 12,12,12 A
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/readdepth2seqdepth.out Wed Apr 22 12:22:50 2015 -0400 @@ -0,0 +1,2 @@ +repeat_length read_length informative_read_depth =locus_specific_sequencing_depth =genome_wide_sequencing_depth +10 100 5 10 15
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/samplePESAM_2_profile_C.txt Wed Apr 22 12:22:50 2015 -0400 @@ -0,0 +1,5 @@ +M01368:22:000000000-A4T24:1:1101:10010:3775_1:N:0:2_1_per1_1 shifted 540 713 713 719 719 759 6 GGGGGG +M01368:22:000000000-A4T24:1:1101:10015:2849_1:N:0:2_1_per1_2 shifted 4007 4082 4082 4088 4088 4258 6 TTTTTT +M01368:22:000000000-A4T24:1:1101:10070:4955_1:N:0:2_1_per1_1 shifted 1849 1930 1930 1936 1936 2100 6 CCCCCC +M01368:22:000000000-A4T24:1:1101:10070:4955_1:N:0:2_1_per1_2 shifted 1849 2025 2025 2030 2030 2100 5 GGGGG +M01368:22:000000000-A4T24:1:1101:10126:5433_1:N:0:2_1_per1_1 shifted 1428 1517 1517 1522 1522 1543 5 AAAAA
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sampleTRgenotypingcorrection Wed Apr 22 12:22:50 2015 -0400 @@ -0,0 +1,2 @@ +chr1 14,13,13,13 A hetero -0.429451855856 13 13 14 +chr1 5,6,6,6,6,7,7,8,8 A hetero -14.8744881854 7 6 8
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sampleTRprofile_C.txt Wed Apr 22 12:22:50 2015 -0400 @@ -0,0 +1,2 @@ +chr1 14,13,13,13 A +chr1 5,6,6,6,6,7,7,8,8 A
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/samplefq.snoope Wed Apr 22 12:22:50 2015 -0400 @@ -0,0 +1,1 @@ +6 40 54 G 0 SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1 TACCCTCCTGTCTTCCCAGACTGATTTCTGTTCCTGCCCTggggggTTCTTGACTCCTCTGAATGGGTACGGGAGTGTGGACCTCAGGGAGGCCCCCTTG GGGGGGGGGGGGGGGGGFGGGGGGGGGFEGGGGGGGGGGG?FFDFGGGGGG?FFFGGGGGDEGGEFFBEFCEEBD@BACB*?=9;(/=5'6=4:?>C*A<
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/samplefq.snoope.new Wed Apr 22 12:22:50 2015 -0400 @@ -0,0 +1,1 @@ +6 40 54 G 0 SRR345592.75000006_HS2000-192_107:1:63:5822:176818_1_per1_1 TACCCTCCTGTCTTCCCAGACTGATTTCTGTTCCTGCCCTggggggTTCTTGACTCCTCTGAATGGGTACGGGAGTGTGGACCTCAGGGAGGCCCCCTTG GGGGGGGGGGGGGGGGGFGGGGGGGGGFEGGGGGGGGGGG?FFDFGGGGGG?FFFGGGGGDEGGEFFBEFCEEBD@BACB*?=9;(/=5'6=4:?>C*A<
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sampleprofilegenerator_in Wed Apr 22 12:22:50 2015 -0400 @@ -0,0 +1,6 @@ +9 9 100000 +10 10 91456 +10 9 1259 +11 11 39657 +11 10 1211 +11 12 514
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sampleprofilegenerator_out Wed Apr 22 12:22:50 2015 -0400 @@ -0,0 +1,30 @@ +chr 9,9 A +chr 9,10 A +chr 9,11 A +chr 9,12 A +chr 10,10 A +chr 10,11 A +chr 10,12 A +chr 11,11 A +chr 11,12 A +chr 12,12 A +chr 9,9,9 A +chr 9,9,10 A +chr 9,9,11 A +chr 9,9,12 A +chr 9,10,10 A +chr 9,10,11 A +chr 9,10,12 A +chr 9,11,11 A +chr 9,11,12 A +chr 9,12,12 A +chr 10,10,10 A +chr 10,10,11 A +chr 10,10,12 A +chr 10,11,11 A +chr 10,11,12 A +chr 10,12,12 A +chr 11,11,11 A +chr 11,11,12 A +chr 11,12,12 A +chr 12,12,12 A
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/samplesortedPESAM_C.sam Wed Apr 22 12:22:50 2015 -0400 @@ -0,0 +1,10 @@ +M01368:22:000000000-A4T24:1:1101:10010:3775_1:N:0:2_1_per1_1 113 shifted 720 37 40M = 541 -46 TTCGTGCACACAGCCCAGCTTGGAGCGAACGACCTACACC HHFG@IIHHHHHIHHFHHGFGGGGDBDDEDDDBBB????? XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:40 +M01368:22:000000000-A4T24:1:1101:10010:3775_1:N:0:2_1_per1_1 177 shifted 541 37 173M = 720 46 CTTCTAGTGTAGCCGTAGTTAGGCCACCACTTCAAGAACTCTGTAGCACCGCCTACATACCTCGCTCTGCTAATCCTGTTACCAGTGGCTGCTGCCAGTGGCGATAAGTCGTGTCTTACCGGGTTGGACTCAAGACGATAGTTACCGGATAAGGCGCAGCGGTCGGGCTGAAC ::GECC:*:)D<GEGGGECCCEC?00E?::CCCCEEECC:C*GEC4'.>ACGGEC:CC?>><DCE?C:EC?GECE?:CCECGEEC*GEECEC:GEEGE?GGECC:ECA2CC*CCC8DEGGEGC=CGECEAEGEEDGGEDEGD=EBGGGFDHHHHHHHHEEHHHHHIIHFIIHH XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:173 +M01368:22:000000000-A4T24:1:1101:10015:2849_1:N:0:2_1_per1_2 113 shifted 4089 37 170M = 4008 -176 GCACAACATGGGGGATCATGTAACTCGCCTTGATCGTTGGGAACCGGAGCTGAATGAAGCCATACCAAACGACGAGCGTGACACCACGATGCCTGTAGCAATGGCAACAACGTTGCGCAAACTATTAACTGGCGAACTACTTACTCTAGCTTCCCGGCAACAATTAATAG GECGGGGGGGGGGGGEGEGGGGD>2GEGGGGGEEGGGGGGGGGGGGGEEECEGEAGGEEGEB>=GGFGEAGHHHEHHHFHFF?ED;HFIHHIIIIHIIHHHHIHHHHIHHHHHHHHIIIIHIHHHHIHHHHHIIHHIIHHIIHIIIIIGGGGGGDDDDDDDDBBB????< XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:170 +M01368:22:000000000-A4T24:1:1101:10015:2849_1:N:0:2_1_per1_2 177 shifted 4008 37 75M = 4089 176 TGCCATAACCATGAGTGATAACACTGCGGCCAACTTACTTCTGACAACGATCGGAGGACCGAAGGAGCTAACCGC CEGGEEEECC?:EEGECGGGGECGGGGEEGGEEGCCGEGGGGGGGGGGDGGGGGE>EEGGGGGGGGGGGAGGGGE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:75 +M01368:22:000000000-A4T24:1:1101:10070:4955_1:N:0:2_1_per1_1 129 shifted 1937 37 164M = 1850 -87 TCAGATAGGGGTCCCTTGACCACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCTCGCTCCGGGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGCCATAAAGCCTAAATAGCCCACACGT 
HHHHIHHHHHHHHHHHHHHHHHHHHHGGFGGGGGGGHGGGGGGGGGGGGEGGGGGGAEEGGGEGGGGGGEGEEGGGGGGGGGGGGGGGGGGGGGGGGGGGGGECGGGGGGGGGGGGGGAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGEGGGCEGEGG XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:138T25 +M01368:22:000000000-A4T24:1:1101:10070:4955_1:N:0:2_1_per1_1 65 shifted 1850 37 81M = 1937 87 CCCTTAACAGTACATAGTACATAAAGCCATTTACCGTACATAGCACATTACAGTCAAATCCCTTCTCGTCCCCATGGATGA ?????BBBEEDBBDDDGGGGGGIIIIIIIIIIIIIHHHHHIIIIIIIIIIIIIIIIIIIIIIIIIIIHIHHHIIIIIIHGH XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:81 +M01368:22:000000000-A4T24:1:1101:10070:4955_1:N:0:2_1_per1_2 129 shifted 2031 37 70M = 1850 -181 TAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGCCATAAAGCCTAAATAGCCCACACGT GGGGGGGGECGGGGGGGGGGGGGGAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGEGGGCEGEGG XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:44T25 +M01368:22:000000000-A4T24:1:1101:10070:4955_1:N:0:2_1_per1_2 65 shifted 1850 37 176M = 2031 181 CCCTTAACAGTACATAGTACATAAAGCCATTTACCGTACATAGCACATTACAGTCAAATCCCTTCTCGTCCCCATGGATGACCCCCCTCAGATAGGGGTCCCTTGACCACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCTCGCTCCGGGCCCATAACACTT ?????BBBEEDBBDDDGGGGGGIIIIIIIIIIIIIHHHHHIIIIIIIIIIIIIIIIIIIIIIIIIIIHIHHHIIIIIIHGHIIIHHHHHHHIHHHHHHHHHHHHHHHHHHHHHGGFGGGGGGGHGGGGGGGGGGGGEGGGGGGAEEGGGEGGGGGGEGEEGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:176 +M01368:22:000000000-A4T24:1:1101:10126:5433_1:N:0:2_1_per1_1 129 shifted 1523 37 21M = 1429 -94 GTCTTTAACTCCACCATTAGC GGGEGGEGGGGGCGGGGGEGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:21 +M01368:22:000000000-A4T24:1:1101:10126:5433_1:N:0:2_1_per1_1 65 shifted 1429 37 89M = 1523 94 CTATGCATCCAACGCGTTGGGAGCTCTCCCATATGGTCGACCTGCAGGCGGCCGCGAATTCACTAGTGATTTCCAAGGACAAATCAGAG ?????BBBDDDDDDDDGGGFGGFEHIIIIIIIHIIIHIHHHHHIIHFHHHHHHHHHHHHHHHHHHHHGGGGGGGGGGGGGGGGGGEGEE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:89