Mercurial > repos > davidvanzessen > shm_csr
changeset 96:385dea3c6cb5 draft
planemo upload commit 423a48569c69301fdbf893ac3a649128404dfff5
author | rhpvorderman |
---|---|
date | Fri, 05 Jan 2024 08:53:22 +0000 |
parents | d63eff357515 |
children | fbc6307dd83b |
files | CHANGELOG.md CONTROL_NWK377_PB_IGHC_MID1_40nt_2.txz a.out merge_and_filter.r nt_overview.r sequence_overview.py shm_csr.py shm_csr.xml show_time_as_float tests/__pycache__/test_shm_csr.cpython-37-pytest-6.2.5.pyc tests/__pycache__/test_shm_csr.cpython-39-pytest-7.2.1.pyc tests/__pycache__/test_shm_csr.cpython-39-pytest-7.4.4.pyc tests/data/.~lock.handleiding activeren pas.docx# tests/data/handleiding activeren pas.docx time_ns wget-log wrapper.sh |
diffstat | 17 files changed, 46 insertions(+), 17 deletions(-) [+] |
line wrap: on
line diff
--- a/CHANGELOG.md Mon Mar 27 13:11:53 2023 +0000 +++ b/CHANGELOG.md Fri Jan 05 08:53:22 2024 +0000 @@ -1,3 +1,8 @@ +version 1.9.0 +----------------- ++ Add a no filter region filter. ++ Fix a bug where tar would not open certain filenames. + version 1.8.1 ----------------- + Fix a bug where input files with spaces could not be used.
--- a/merge_and_filter.r Mon Mar 27 13:11:53 2023 +0000 +++ b/merge_and_filter.r Fri Jan 05 08:53:22 2024 +0000 @@ -183,6 +183,7 @@ } else if(empty.region.filter == "FR2"){ result = result[result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ] } +# If empty region filter is None, nothing happens. print(paste("After removal sequences that are missing a gene region:", nrow(result))) filtering.steps = rbind(filtering.steps, c("After removal sequences that are missing a gene region", nrow(result))) @@ -219,7 +220,7 @@ clmns = names(result) if(filter.unique == "remove_vjaa"){ result$unique.def = paste(result$VGene, result$JGene, result$CDR3.IMGT.AA) - } else if(empty.region.filter == "leader"){ + } else if(empty.region.filter == "leader" || empty.region.filter == "None"){ result$unique.def = paste(result$FR1.IMGT.seq, result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq) } else if(empty.region.filter == "FR1"){ result$unique.def = paste(result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
--- a/nt_overview.r Mon Mar 27 13:11:53 2023 +0000 +++ b/nt_overview.r Fri Jan 05 08:53:22 2024 +0000 @@ -17,7 +17,7 @@ NToverview = merged -if(empty.region.filter == "leader"){ +if(empty.region.filter == "leader" || empty.region.filter == "None"){ NToverview$seq = paste(NToverview$FR1.IMGT.seq, NToverview$CDR1.IMGT.seq, NToverview$FR2.IMGT.seq, NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq) } else if(empty.region.filter == "FR1"){ NToverview$seq = paste(NToverview$CDR1.IMGT.seq, NToverview$FR2.IMGT.seq, NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq)
--- a/sequence_overview.py Mon Mar 27 13:11:53 2023 +0000 +++ b/sequence_overview.py Fri Jan 05 08:53:22 2024 +0000 @@ -101,7 +101,7 @@ sequence_columns = [ "FR1.IMGT.seq", "CDR1.IMGT.seq", "FR2.IMGT.seq", "CDR2.IMGT.seq", "FR3.IMGT.seq", "CDR3.IMGT.seq"] - if empty_region_filter == "leader": + if empty_region_filter == "leader" or empty_region_filter == "None": sequence_columns = sequence_columns elif empty_region_filter == "FR1": sequence_columns = sequence_columns[1:]
--- a/shm_csr.py Mon Mar 27 13:11:53 2023 +0000 +++ b/shm_csr.py Fri Jan 05 08:53:22 2024 +0000 @@ -2,13 +2,20 @@ import logging import sys import os +import traceback import typing from typing import Optional from collections import defaultdict -REGION_FILTERS = ("leader", "FR1", "CDR1", "FR2", "CDR2") +REGION_FILTERS = ("leader", "FR1", "CDR1", "FR2", "CDR2", "None") + +def int_or_zero(value: typing.Any): + try: + return int(value) + except ValueError: + return 0 class Mutation(typing.NamedTuple): """Represent a mutation type as a tuple""" @@ -177,15 +184,15 @@ mutationList += mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] mutationListByID[ID] = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] - fr1Length = int(linesplt[fr1LengthIndex]) - fr2Length = int(linesplt[fr2LengthIndex]) - fr3Length = int(linesplt[fr3LengthIndex]) - cdr1Length = int(linesplt[cdr1LengthIndex]) - cdr2Length = int(linesplt[cdr2LengthIndex]) + fr1Length = int_or_zero(linesplt[fr1LengthIndex]) + fr2Length = int_or_zero(linesplt[fr2LengthIndex]) + fr3Length = int_or_zero(linesplt[fr3LengthIndex]) + cdr1Length = int_or_zero(linesplt[cdr1LengthIndex]) + cdr2Length = int_or_zero(linesplt[cdr2LengthIndex]) LengthDic[ID] = (fr1Length, cdr1Length, fr2Length, cdr2Length, fr3Length) - cdr1AALengthDic[ID] = int(linesplt[cdr1AALengthIndex]) - cdr2AALengthDic[ID] = int(linesplt[cdr2AALengthIndex]) + cdr1AALengthDic[ID] = int_or_zero(linesplt[cdr1AALengthIndex]) + cdr2AALengthDic[ID] = int_or_zero(linesplt[cdr2AALengthIndex]) IDlist += [ID] print("len(mutationdic) =", len(mutationdic)) @@ -222,6 +229,8 @@ # We determine the position to start summing below. # This returns 0 for leader, 1 for FR1 etc. length_start_pos = REGION_FILTERS.index(empty_region_filter) + if empty_region_filter == "None": + length_start_pos = 0 o.write("Sequence.ID\tnumber_of_mutations\tnumber_of_tandems\tregion_length\texpected_tandems\tlongest_tandem\ttandems\n") for ID in IDlist:
--- a/shm_csr.xml Mon Mar 27 13:11:53 2023 +0000 +++ b/shm_csr.xml Fri Jan 05 08:53:22 2024 +0000 @@ -1,4 +1,4 @@ -<tool id="shm_csr" name="SHM & CSR pipeline" version="1.8.1" profile="16.04"> +<tool id="shm_csr" name="SHM & CSR pipeline" version="1.9.0" profile="16.04"> <description></description> <requirements> <requirement type="package" version="3.7.1">python</requirement> @@ -24,9 +24,8 @@ <![CDATA[ #import os #set $input=os.path.basename($in_file.name) - ln -s "$in_file" "$input" && #if str ( $filter_unique.filter_unique_select ) == "remove": - $__tool_directory__/wrapper.sh "$input" + $__tool_directory__/wrapper.sh "$in_file" custom $out_file $out_file.files_path "$input" "-" $functionality $unique $naive_output_cond.naive_output $naive_output_ca $naive_output_cg @@ -60,6 +59,7 @@ <option value="FR1" selected="true">FR1: include CDR1,FR2,CDR2,FR3 in filters</option> <option value="CDR1">CDR1: include FR2,CDR2,FR3 in filters</option> <option value="FR2">FR2: include CDR2,FR3 in filters</option> + <option value="None">No filter: sequences with mission regions are not filtered.</option> </param> <param name="functionality" type="select" label="Functionality filter" help="" > <option value="productive" selected="true">Productive (Productive and Productive see comment)</option>
--- a/tests/data/.~lock.handleiding activeren pas.docx# Mon Mar 27 13:11:53 2023 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1 +0,0 @@ -Vorderman\, R.H.P. (MOLEPI) ,rhpvorderman,sasc-pc-6,21.02.2023 15:01,file:///home/rhpvorderman/.config/libreoffice/4; \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/wget-log Fri Jan 05 08:53:22 2024 +0000 @@ -0,0 +1,15 @@ +--2024-01-03 14:15:48-- https://filesender.surf.nl/download.php?token=e17473dc-a342-4bfc-b1ff-5bc3b5fb8bd8 +Herleiden van filesender.surf.nl (filesender.surf.nl)... 2001:610:188:f001:145:101:124:6, 145.101.124.6 +Verbinding maken met filesender.surf.nl (filesender.surf.nl)|2001:610:188:f001:145:101:124:6|:443... verbonden. +HTTP-verzoek is verzonden; wachten op antwoord... 302 Found +Locatie: /?s=exception&exception=eyJtZXNzYWdlIjoiZG93bmxvYWRfbWlzc2luZ19maWxlc19pZHMiLCJ1aWQiOiI2NTk1NWUwNDI2NmFiIiwiZGV0YWlscyI6bnVsbH0= [volgen...] +--2024-01-03 14:15:48-- https://filesender.surf.nl/?s=exception&exception=eyJtZXNzYWdlIjoiZG93bmxvYWRfbWlzc2luZ19maWxlc19pZHMiLCJ1aWQiOiI2NTk1NWUwNDI2NmFiIiwiZGV0YWlscyI6bnVsbH0= +Verbinding met [filesender.surf.nl]:443 wordt hergebruikt. +HTTP-verzoek is verzonden; wachten op antwoord... 200 OK +Lengte: 5324 (5,2K) [text/html] +Wordt opgeslagen als: ‘/home/rhpvorderman/Downloads/tmp/test.tar.xz’ + + /home/rhpvorderman/Downloads/tmp/test.tar.xz 0%[ ] 0 --.-KB/s /home/rhpvorderman/Downloads/tmp/test.tar.xz 100%[=====================================================================================================================================>] 5,20K --.-KB/s in 0s + +2024-01-03 14:15:48 (414 MB/s) - '‘/home/rhpvorderman/Downloads/tmp/test.tar.xz’' opgeslagen [5324/5324] +
--- a/wrapper.sh Mon Mar 27 13:11:53 2023 +0000 +++ b/wrapper.sh Fri Jan 05 08:53:22 2024 +0000 @@ -23,7 +23,7 @@ class_filter=${19} empty_region_filter=${20} fast=${21} -BASENAME=$(basename "$input") +BASENAME=$(basename "$title") # Cut off .txz or .tgz suffix and also replace spaces with underscores. NEW_IMGT_PREFIX="new_IMGT_${BASENAME%.*}" NEW_IMGT_PREFIX=${NEW_IMGT_PREFIX// /_} @@ -31,7 +31,7 @@ #exec 5> debug_output.txt #BASH_XTRACEFD="5" ## Busybox date does not support '+%s.%N'. So use a custom program. Can be -## Compiled with cc -Os show_time_as_float.c -o show_time_as_float +## Compiled with cc -static -Os show_time_as_float.c -o show_time_as_float #PS4='$(${dir}/show_time_as_float) $LINENO: ' #set -x