Mercurial > repos > triasteran > ribogalaxy_umi_processing
changeset 5:e370df93715d draft
Uploaded
author | triasteran |
---|---|
date | Tue, 21 Jun 2022 09:22:37 +0000 |
parents | a580e700aac3 |
children | 1ce4b52212c4 |
files | UMI_riboseq_processing/UMI.py UMI_riboseq_processing/UMI_riboseq.xml UMI_riboseq_processing/UMI_test.py |
diffstat | 3 files changed, 57 insertions(+), 48 deletions(-) [+] |
line wrap: on
line diff
--- a/UMI_riboseq_processing/UMI.py Tue Jun 21 08:32:44 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,46 +0,0 @@ -import gzip -from mimetypes import guess_type -from functools import partial -from Bio import SeqIO -from sys import argv, exit - -def copy_UMI_to_header_and_output_trimmed_read(pathToFastaFile, output): - # find wheather its plain or gzipped fastq - encoding = guess_type(pathToFastaFile)[1] # uses file extension - _open = partial(gzip.open, mode='rt') if encoding == 'gzip' else open - # output file will be in gz format - output = gzip.open(output,"wt") - # open and parse - with _open(pathToFastaFile) as f: - for record in SeqIO.parse(f, 'fastq'): - lines = record.format('fastq').split('\n') # list of each record: id, seq, '+', quality - header = lines[0] - seq = lines[1] - sep = lines[2] - qual = lines[3] - trimmed_seq = seq[2:-6]+"\n" # fooprint + barcode - UMI = seq[0:2]+seq.rstrip()[-5:] #7nt in total; 5'NN and last 3'NNNNN - split_header = header.split(" ") - new_header = split_header[0]+"_"+UMI+" "+split_header[1] - if qual[-1:] == "\n": - new_qual = qual[2:-6]+"\n" - else: - new_qual = qual[2:-6] - output.write(new_header+'\n') - output.write(trimmed_seq) - output.write(sep+'\n') - output.write(new_qual+'\n') - - output.close() - -def main(): - if len(argv) != 3: - exit("Usage: 2 arguments required\n1: Path to fasta file \n2: name of output file") - - # Get paths - pathToFastaFile = argv[1] - output = argv[2] - copy_UMI_to_header_and_output_trimmed_read(pathToFastaFile, output) - -if __name__ == "__main__": - main()
--- a/UMI_riboseq_processing/UMI_riboseq.xml Tue Jun 21 08:32:44 2022 +0000 +++ b/UMI_riboseq_processing/UMI_riboseq.xml Tue Jun 21 09:22:37 2022 +0000 @@ -1,6 +1,5 @@ -<tool id="UMI_riboseq" name="move UMIs from reads to header" version="0.1.4"> +<tool id="UMI_riboseq" name="move UMIs from reads to header" version="0.1.5"> <requirements> - <requirement type="package" version="1.75">biopython</requirement> </requirements> <command detect_errors="exit_code"> <![CDATA[ python3 '$__tool_directory__/UMI.py' $reads $output ]]>
# UMI_riboseq_processing/UMI_test.py
"""Move UMIs from the ends of FASTQ reads into the read header.

For every 4-line FASTQ record the first 2 and the last 5 nucleotides of the
read are concatenated into a 7-nt UMI, the UMI is appended to the read name
(``@name_UMI description``), and the sequence and quality strings are trimmed
by the same 2 + 5 positions.  The result is written as gzip-compressed text.

Usage: python3 UMI_test.py <input.fastq[.gz]> <output.fastq.gz>
"""
import gzip
import itertools
from functools import partial
from itertools import zip_longest
from mimetypes import guess_type
from sys import argv, exit


def grouper(iterable, n, fillvalue=None):
    """Collect items from *iterable* into consecutive tuples of length *n*.

    The final tuple is padded with *fillvalue* when the iterable is exhausted
    before the tuple is full.
    """
    # n references to ONE iterator: zip_longest pulls n consecutive items
    # from it for every tuple it builds.
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)


# Number of lines per FASTQ record: header, sequence, separator, quality.
chunk_size = 4

# First two bytes of every gzip stream.
_GZIP_MAGIC = b"\x1f\x8b"
# Number of UMI nucleotides taken from the 5' and the 3' end of each read.
_UMI_5P_LEN = 2
_UMI_3P_LEN = 5


def _is_gzipped(path):
    """Return True if *path* has a gzip extension or starts with gzip magic bytes."""
    if guess_type(path)[1] == 'gzip':
        return True
    # The extension is not always informative, so also look at the content.
    with open(path, 'rb') as handle:
        return handle.read(len(_GZIP_MAGIC)) == _GZIP_MAGIC


def copy_UMI_to_header_and_output_trimmed_read(pathToFastaFile, output):
    """Write a gzip FASTQ with UMIs moved from the read ends into the header.

    Args:
        pathToFastaFile: Path to the input FASTQ file, plain or gzipped.
        output: Path of the output file; always written gzip-compressed.

    Raises:
        ValueError: If the input ends in the middle of a 4-line record.

    Note:
        Reads shorter than 7 nt yield an empty trimmed read and a UMI whose
        5' and 3' parts overlap; such reads are not filtered here.
    """
    _open = partial(gzip.open, mode='rt') if _is_gzipped(pathToFastaFile) else open
    # Both handles are context-managed so they are closed even if a record
    # turns out to be malformed halfway through the file.
    with _open(pathToFastaFile) as reads, gzip.open(output, "wt") as trimmed:
        records = grouper(reads, chunk_size, "")
        for record_number, lines in enumerate(records, start=1):
            # Strip line endings up front so that slicing never has to
            # account for a newline that may or may not be present (the last
            # line of a file often has none).
            stripped = [line.rstrip("\r\n") for line in lines]
            if not any(stripped):
                # Blank line(s) at the end of the file, not a record.
                continue
            if "" in lines:
                # A line read from a file is never the empty string, so ""
                # can only be grouper's padding, i.e. the record is truncated.
                raise ValueError(
                    "Incomplete FASTQ record #%d in %s: expected %d lines"
                    % (record_number, pathToFastaFile, chunk_size)
                )
            header, seq, sep, qual = stripped[:4]

            # 7 nt in total: 5' NN plus the last 3' NNNNN.
            UMI = seq[:_UMI_5P_LEN] + seq[-_UMI_3P_LEN:]
            # What remains is the footprint + barcode (per the original
            # author's comment); quality is trimmed at the same positions.
            trimmed_seq = seq[_UMI_5P_LEN:-_UMI_3P_LEN]
            new_qual = qual[_UMI_5P_LEN:-_UMI_3P_LEN]

            # partition() tolerates headers without a description and keeps
            # the whole description instead of only its first field.
            name, _, description = header.partition(" ")
            new_header = name + "_" + UMI
            if description:
                new_header += " " + description

            trimmed.write(
                "\n".join((new_header, trimmed_seq, sep, new_qual)) + "\n"
            )


def main():
    """Command-line entry point: validate arguments and process the file."""
    if len(argv) != 3:
        exit("Usage: 2 arguments required\n1: Path to fasta file \n2: name of output file")

    # Get paths
    pathToFastaFile = argv[1]
    output = argv[2]
    copy_UMI_to_header_and_output_trimmed_read(pathToFastaFile, output)


if __name__ == "__main__":
    main()